# FILE ENCODING - Handling different text encodings
# Language: Python
#!/usr/bin/env python3
"""
FILE ENCODING - Handling different text encodings
Demonstrates working with UTF-8, ASCII, and other encodings
"""
import os
import tempfile
print("=" * 60)
print("FILE ENCODING - Text Encoding Handling")
print("=" * 60)

# Every demo file is created in the system temp directory and removed again
# by the cleanup step at the end of the script.
temp_dir = tempfile.gettempdir()

# Example 1: Default UTF-8 encoding
# Round-trip a string mixing ASCII, CJK and an emoji through a UTF-8 file.
print("\n1. Default UTF-8 Encoding")
print("-" * 40)
utf8_file = os.path.join(temp_dir, "utf8.txt")
text = "Hello 世界 🌍"  # English, Chinese, Emoji
with open(utf8_file, 'w', encoding='utf-8') as f:
    f.write(text)
print(f"Wrote: {text}")
with open(utf8_file, 'r', encoding='utf-8') as f:
    read_text = f.read()
print(f"Read: {read_text}")

# Example 2: Explicit encoding specification
# Latin-1 can represent these accented characters, so the round trip is
# lossless as long as the same encoding is used for reading and writing.
print("\n2. Specifying Different Encodings")
print("-" * 40)
latin1_file = os.path.join(temp_dir, "latin1.txt")
text = "Café résumé naïve"
with open(latin1_file, 'w', encoding='latin-1') as f:
    f.write(text)
print(f"Wrote with latin-1: {text}")
with open(latin1_file, 'r', encoding='latin-1') as f:
    read_text = f.read()
print(f"Read with latin-1: {read_text}")

# Example 3: Encoding mismatch detection
print("\n3. What Happens with Wrong Encoding?")
print("-" * 40)
test_file = os.path.join(temp_dir, "test_encoding.txt")
# Write with UTF-8
with open(test_file, 'w', encoding='utf-8') as f:
    f.write("Café")
# Try to read with ASCII (will fail on special chars)
try:
    with open(test_file, 'r', encoding='ascii') as f:
        content = f.read()
except UnicodeDecodeError as e:
    print(f" Error with ASCII: {type(e).__name__}")
    print(" ASCII can't decode 'é' character")
# Read with correct encoding
with open(test_file, 'r', encoding='utf-8') as f:
    content = f.read()
print(f" Success with UTF-8: {content}")

# Example 4: Handling errors
# The same UTF-8 file is decoded as ASCII three times, each time with a
# different `errors=` policy, to show how undecodable bytes are treated.
print("\n4. Error Handling Strategies")
print("-" * 40)
problem_file = os.path.join(temp_dir, "problem.txt")
with open(problem_file, 'w', encoding='utf-8') as f:
    f.write("Special chars: é ñ ü")
# Strategy 1: ignore errors
with open(problem_file, 'r', encoding='ascii', errors='ignore') as f:
    content = f.read()
print(f" Ignore errors: '{content}'")
# Strategy 2: replace errors
with open(problem_file, 'r', encoding='ascii', errors='replace') as f:
    content = f.read()
print(f" Replace errors: '{content}'")
# Strategy 3: backslashreplace
with open(problem_file, 'r', encoding='ascii', errors='backslashreplace') as f:
    content = f.read()
print(f" Backslash replace: '{content}'")

# Example 5: Common encodings
# Write the same text with several encodings and report the on-disk size.
print("\n5. Testing Common Encodings")
print("-" * 40)
encodings_to_test = ['utf-8', 'utf-16', 'utf-32', 'ascii']
text = "Hello World"
for encoding in encodings_to_test:
    # Use a dedicated name here: reusing `test_file` would clobber the path
    # from Example 3 and leave test_encoding.txt behind after cleanup.
    sample_file = os.path.join(temp_dir, f"{encoding}_test.txt")
    try:
        with open(sample_file, 'w', encoding=encoding) as f:
            f.write(text)
        file_size = os.path.getsize(sample_file)
        print(f" {encoding:10s}: {file_size:3d} bytes")
    except (LookupError, UnicodeError, OSError) as e:
        # LookupError: unknown codec; UnicodeError: text not encodable;
        # OSError: filesystem failure.
        print(f" {encoding:10s}: Error - {e}")
    finally:
        # Remove the sample even when writing or measuring it failed.
        if os.path.exists(sample_file):
            os.remove(sample_file)

# Example 6: Reading with unknown encoding
print("\n6. Detecting File Content")
print("-" * 40)
unknown_file = os.path.join(temp_dir, "unknown.txt")
with open(unknown_file, 'w', encoding='utf-8') as f:
    f.write("This is a test file")
# Read as binary first to inspect
with open(unknown_file, 'rb') as f:
    raw_bytes = f.read(20)
print(f" First 20 bytes: {raw_bytes}")
# The content written above is pure ASCII, so this fixed-size prefix cannot
# end in the middle of a multi-byte UTF-8 sequence.
print(f" As UTF-8: {raw_bytes.decode('utf-8')}")

# Example 7: Converting between encodings
print("\n7. Converting File Encoding")
print("-" * 40)
source_file = os.path.join(temp_dir, "source_utf8.txt")
dest_file = os.path.join(temp_dir, "dest_latin1.txt")
# Write UTF-8
with open(source_file, 'w', encoding='utf-8') as f:
    f.write("Café résumé")
# Read UTF-8, write Latin-1
with open(source_file, 'r', encoding='utf-8') as src:
    content = src.read()
with open(dest_file, 'w', encoding='latin-1') as dst:
    dst.write(content)
print(" Converted UTF-8 → Latin-1")
# Verify
with open(dest_file, 'r', encoding='latin-1') as f:
    print(f" Result: {f.read()}")

# Cleanup
# Example 5 removes its own per-encoding files; everything else is removed here.
for path in [utf8_file, latin1_file, test_file, problem_file,
             unknown_file, source_file, dest_file]:
    if os.path.exists(path):
        os.remove(path)

print("\n" + "=" * 60)
print("Key Points:")
print(" - Always specify encoding explicitly")
print(" - UTF-8 is the modern standard")
print(" - Use errors='ignore'/'replace' for robustness")
print(" - Encoding affects file size")
print(" - Mismatched encoding causes errors")
print("=" * 60)