This Python script scans a specified directory for media files, analyzes their audio streams using ffprobe (part of FFmpeg), and generates a report categorizing files based on their audio language. It lists files that have only specific non-English audio separately from files whose audio language is undefined (or that have no audio at all), and provides a detailed report of all audio streams. The script is designed to run on Debian 12.
Key Features
- Comprehensive Language Detection: Identifies English (‘eng’), specific non-English languages, and undefined (‘und’) audio streams.
- Recursive Scanning: Processes all media files in the specified directory and its subdirectories.
- Detailed Reporting: Provides both a summary of non-English/undefined files and a detailed breakdown of all audio streams.
- Robust Error Handling: Skips problematic files and continues processing, with clear error messages.
- Customizable: Media file extensions can be modified in the media_extensions set.
Script Summary:
- Uses ffprobe to analyze media file streams
- Supports common video formats (.mp4, .mkv, .avi, .mov, .wmv, .flv, .m4v)
- Recursively scans all subdirectories
- Creates a text file (audio_language_report.txt) with results
- Handles errors gracefully
- Takes the directory path to scan as a single command-line argument
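The output file will contain:
- A list of file paths for media files that have specific non-English audio but no English audio stream, or a message indicating that no such files were found
- A list of file paths for media files whose audio language is undefined (or that have no audio), or a message indicating that no such files were found
- A detailed breakdown of the audio streams of every media file scanned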
Install Dependencies
First, install the required dependencies:
sudo apt update
sudo apt install ffmpeg python3
Save the script to a file (e.g., check_audio.py)
Script:
#!/usr/bin/env python3
import os
import subprocess
import json
import sys
from pathlib import Path
def get_audio_streams(file_path):
    """
    Get detailed information about audio streams in a media file.

    Runs ffprobe with ``-show_streams`` on *file_path* and keeps only the
    streams whose ``codec_type`` is ``audio``.

    Args:
        file_path: Path (str or Path) of the media file to inspect.

    Returns:
        A list of dictionaries, one per audio stream, with the keys
        ``index``, ``codec``, ``language`` and ``channels``. Missing values
        are reported as ``'unknown'`` (``'und'`` for a missing or empty
        language tag). An empty list is returned when the file has no audio
        streams, when ffprobe cannot be started or fails, or when its
        output is not valid JSON; in the error cases a message is printed.
    """
    cmd = [
        'ffprobe',
        '-v', 'error',
        '-show_streams',
        '-print_format', 'json',
        str(file_path)
    ]
    try:
        # Decode explicitly as UTF-8 instead of the locale encoding, and
        # replace undecodable bytes so odd metadata cannot abort the scan.
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            encoding='utf-8',
            errors='replace',
            check=True,
        )
        data = json.loads(result.stdout)
    except (OSError, subprocess.CalledProcessError, json.JSONDecodeError) as e:
        # OSError covers ffprobe being missing or not executable; without it
        # a single launch failure would propagate and stop the whole run.
        message = f"Error processing {file_path}: {e}"
        # CalledProcessError carries ffprobe's own diagnostic in stderr.
        ffprobe_detail = (getattr(e, 'stderr', None) or '').strip()
        if ffprobe_detail:
            message += f" ({ffprobe_detail})"
        print(message)
        return []

    return [
        {
            'index': stream.get('index', 'unknown'),
            'codec': stream.get('codec_name', 'unknown'),
            # Treat an empty language tag the same as a missing one.
            'language': stream.get('tags', {}).get('language') or 'und',
            'channels': stream.get('channels', 'unknown'),
        }
        for stream in data.get('streams', [])
        if stream.get('codec_type') == 'audio'
    ]
def _summarize_audio_streams(audio_streams):
    """Return (report_text, has_english, all_undefined) for one file's audio streams."""
    report_text = f" Found {len(audio_streams)} audio stream(s):\n"
    has_english = False
    all_undefined = True
    for stream in audio_streams:
        lang = stream['language'].lower()
        report_text += f" Stream {stream['index']}: {lang} ({stream['codec']}, {stream['channels']} channels)\n"
        if lang == 'eng':
            has_english = True
        if lang != 'und':
            all_undefined = False
    return report_text, has_english, all_undefined


def _format_summary_section(heading, description, paths):
    """Return one summary section: heading, then a count and the paths, or a 'none found' line."""
    if not paths:
        return f"{heading}\nNo files found with {description}.\n\n"
    listing = "\n".join(paths)
    return f"{heading}\nFound {len(paths)} file(s) with {description}:\n{listing}\n\n"


def scan_directory(directory_path, output_file):
    """
    Scan directory for media files and analyze audio stream languages.

    Every file under *directory_path* (recursively) whose extension is in
    ``media_extensions`` is passed to ``get_audio_streams``. Files are then
    grouped as follows:

    * at least one 'eng' stream: only listed in the detailed report;
    * no 'eng' stream and at least one stream with another language:
      listed under non-English audio;
    * only 'und' streams, or no audio streams returned: listed under
      undefined language audio.

    Args:
        directory_path: Directory to scan.
        output_file: Path of the text report to write.

    Returns:
        None. Prints an error and returns without writing anything when
        *directory_path* is not a directory; prints an error when the
        report cannot be written.
    """
    media_extensions = {'.mp4', '.mkv', '.avi', '.mov', '.wmv', '.flv', '.m4v'}
    directory = Path(directory_path)
    if not directory.is_dir():
        print(f"Error: {directory_path} is not a valid directory")
        return

    # Lists for results
    no_english_files = []
    undefined_lang_files = []
    detailed_report = []

    # rglob yields entries in filesystem order, which differs between runs
    # and machines; sorting keeps the report reproducible.
    for file_path in sorted(directory.rglob('*')):
        if not (file_path.is_file() and file_path.suffix.lower() in media_extensions):
            continue
        print(f"Analyzing: {file_path}")
        audio_streams = get_audio_streams(file_path)

        file_entry = f"File: {file_path}\n"
        if audio_streams:
            stream_text, has_english, all_undefined = _summarize_audio_streams(audio_streams)
            file_entry += stream_text
            if not has_english:
                bucket = undefined_lang_files if all_undefined else no_english_files
                bucket.append(str(file_path))
        else:
            file_entry += " No audio streams found\n"
            undefined_lang_files.append(str(file_path))  # Treat no audio as undefined
        detailed_report.append(file_entry)

    report = (
        _format_summary_section(
            "=== Files With Non-English Audio (Excluding Undefined) ===",
            "specific non-English audio",
            no_english_files,
        )
        + _format_summary_section(
            "=== Files With Undefined Language Audio (or No Audio) ===",
            "undefined language audio",
            undefined_lang_files,
        )
        + "=== Detailed Audio Stream Report ===\n"
        + "\n".join(detailed_report)
    )

    try:
        # Write UTF-8 regardless of locale; surrogateescape round-trips file
        # names containing undecodable bytes instead of raising
        # UnicodeEncodeError, which the OSError handler would not catch.
        with open(output_file, 'w', encoding='utf-8', errors='surrogateescape') as f:
            f.write(report)
        print(f"Results written to {output_file}")
    except OSError as e:
        print(f"Error writing to output file: {e}")
def main():
    """
    Command-line entry point: validate the arguments and tooling, then scan.

    Expects exactly one command-line argument, the directory to scan. The
    report is written to ``audio_language_report.txt`` (relative to the
    current working directory).

    Exits with status 1 when the argument count is wrong, when the given
    path is not a directory, or when ``ffprobe -version`` cannot be run.
    """
    if len(sys.argv) != 2:
        print("Usage: ./check_audio.py <directory_path>")
        print("Example: ./check_audio.py /path/to/media")
        sys.exit(1)

    directory_path = sys.argv[1]
    output_file = "audio_language_report.txt"

    # Validate here so a bad path gives a non-zero exit status;
    # scan_directory only prints the error and returns.
    if not Path(directory_path).is_dir():
        print(f"Error: {directory_path} is not a valid directory")
        sys.exit(1)

    try:
        subprocess.run(['ffprobe', '-version'], capture_output=True, check=True)
    except (subprocess.CalledProcessError, OSError):
        # OSError includes FileNotFoundError as well as other launch
        # failures such as PermissionError.
        print("Error: FFmpeg is not installed. Please install it using 'sudo apt install ffmpeg'")
        sys.exit(1)

    scan_directory(directory_path, output_file)


if __name__ == "__main__":
    main()
Make the script executable:
chmod +x check_audio.py
Run the script:
./check_audio.py /path/to/media
Example output in audio_language_report.txt:
=== Files With Non-English Audio (Excluding Undefined) ===
Found 1 file(s) with specific non-English audio:
/path/to/video1.mkv
=== Files With Undefined Language Audio (or No Audio) ===
Found 2 file(s) with undefined language audio:
/path/to/video2.mp4
/path/to/video4.avi
=== Detailed Audio Stream Report ===
File: /path/to/video1.mkv
Found 1 audio stream(s):
Stream 1: spa (aac, 2 channels)
File: /path/to/video2.mp4
Found 1 audio stream(s):
Stream 1: und (mp3, 2 channels)
File: /path/to/video3.mkv
Found 2 audio stream(s):
Stream 1: eng (aac, 6 channels)
Stream 2: jpn (aac, 2 channels)
File: /path/to/video4.avi
No audio streams found
Step-by-Step Breakdown
Script Initialization and Dependencies
- The script uses Python 3, which is included with Debian 12.
- Requires FFmpeg (ffprobe) to analyze media files. Install it with:
sudo apt update
sudo apt install ffmpeg
- Imports necessary Python modules: os, subprocess, json, sys, and pathlib.Path.
- The script is executed with a command-line argument specifying the directory to scan.
Command-Line Argument Handling
- The script expects a single command-line argument: the path to the directory to scan.
- Usage example:
./check_audio.py /path/to/media
- If the argument is missing or extra arguments are provided, it displays usage instructions and exits:
Usage: ./check_audio.py <directory_path>
Example: ./check_audio.py /path/to/media
- The output report is saved to a file named audio_language_report.txt.
FFmpeg Availability Check
- Verifies that ffprobe is installed by running:
ffprobe -version
- If FFmpeg is not installed, the script exits with an error message instructing the user to install it.
Audio Stream Analysis (get_audio_streams Function)
- Uses ffprobe to extract stream information from a media file in JSON format.
- Command executed:
ffprobe -v error -show_streams -print_format json <file_path>
- Parses the JSON output to identify audio streams.
- For each audio stream, collects:
  - Stream index
  - Codec name (e.g., aac, mp3)
  - Language tag (defaults to ‘und’ if undefined)
  - Number of channels
- Returns a list of dictionaries containing stream details or an empty list if an error occurs (e.g., file corruption or invalid format).
Directory Scanning (scan_directory Function)
- Accepts the directory path and output file name as parameters.
- Supports common media file extensions: .mp4, .mkv, .avi, .mov, .wmv, .flv, .m4v.
- Recursively scans the directory using Path.rglob to find all media files.
- For each file:
  - Calls get_audio_streams to retrieve audio stream details.
  - Builds a detailed report entry listing all audio streams, including their language, codec, and channels.
  - Categorizes the file based on its audio streams:
    - Files with English audio: If any stream has language ‘eng’, the file is excluded from summary lists.
    - Files with non-English audio: If no ‘eng’ stream exists and at least one stream has a specific language (e.g., ‘spa’, ‘fre’), the file is added to no_english_files.
    - Files with undefined language or no audio: If all streams are ‘und’ (undefined) or no audio streams exist, the file is added to undefined_lang_files.
Output Generation
Writes results to audio_language_report.txt in three sections:
- Files With Non-English Audio (Excluding Undefined):
- Lists files with specific non-English languages (e.g., Spanish, French).
- Example:
Found 1 file(s) with specific non-English audio:
/path/to/video1.mkv
- Files With Undefined Language Audio (or No Audio)
- Lists files with only ‘und’ language tags or no audio streams.
- Example:
Found 2 file(s) with undefined language audio:
/path/to/video2.mp4
/path/to/video4.avi
- Detailed Audio Stream Report:
- Lists all files with their audio stream details.
- Example:
File: /path/to/video1.mkv
Found 1 audio stream(s):
Stream 1: spa (aac, 2 channels)
File: /path/to/video2.mp4
Found 1 audio stream(s):
Stream 1: und (mp3, 2 channels)
File: /path/to/video3.mkv
Found 2 audio stream(s):
Stream 1: eng (aac, 6 channels)
Stream 2: jpn (aac, 2 channels)
File: /path/to/video4.avi
No audio streams found
- Handles IO errors gracefully, printing an error message if the output file cannot be written.
Error Handling
- Checks for valid directory input; exits if the directory is invalid.
- Handles ffprobe errors (e.g., corrupted files) by skipping problematic files and logging errors.
- Manages JSON parsing errors, ensuring the script continues processing other files.