# Skip to content

# FILE ENCODING - Handling different text encodings

# Python
#!/usr/bin/env python3
"""
FILE ENCODING - Handling different text encodings
Demonstrates working with UTF-8, ASCII, and other encodings
"""

import os
import tempfile

# Banner: the script title framed by two 60-character rules.
print("=" * 60, "FILE ENCODING - Text Encoding Handling", "=" * 60, sep="\n")

# Every demo file is created in the directory tempfile.gettempdir() returns.
temp_dir = tempfile.gettempdir()

# Example 1: write a string as UTF-8 and read it back with the same codec
print("\n1. Default UTF-8 Encoding", "-" * 40, sep="\n")
utf8_file = os.path.join(temp_dir, "utf8.txt")

# One sample mixing ASCII letters, CJK characters and an emoji.
text = "Hello 世界 🌍"
with open(utf8_file, mode='w', encoding='utf-8') as writer:
    writer.write(text)
print(f"Wrote: {text}")

with open(utf8_file, encoding='utf-8') as reader:
    read_text = reader.read()
print(f"Read: {read_text}")

# Example 2: Explicit encoding specification
print("\n2. Specifying Different Encodings", "-" * 40, sep="\n")
latin1_file = os.path.join(temp_dir, "latin1.txt")

# Accented Western European letters, all representable in Latin-1.
text = "Café résumé naïve"
with open(latin1_file, mode='w', encoding='latin-1') as writer:
    writer.write(text)
print(f"Wrote with latin-1: {text}")

# Read back with the same codec that wrote the file.
with open(latin1_file, encoding='latin-1') as reader:
    read_text = reader.read()
print(f"Read with latin-1: {read_text}")

# Example 3: Encoding mismatch detection
print("\n3. What Happens with Wrong Encoding?", "-" * 40, sep="\n")
test_file = os.path.join(temp_dir, "test_encoding.txt")

# UTF-8 stores "é" as bytes outside the ASCII range.
with open(test_file, mode='w', encoding='utf-8') as writer:
    writer.write("Café")

# Decoding those bytes as ASCII raises UnicodeDecodeError; report it and go on.
try:
    with open(test_file, encoding='ascii') as ascii_reader:
        content = ascii_reader.read()
except UnicodeDecodeError as decode_error:
    error_name = type(decode_error).__name__
    print(f"  Error with ASCII: {error_name}")
    print("  ASCII can't decode 'é' character")

# The codec that wrote the file reads it back without error.
with open(test_file, encoding='utf-8') as utf8_reader:
    content = utf8_reader.read()
print(f"  Success with UTF-8: {content}")

# Example 4: Handling errors
print("\n4. Error Handling Strategies", "-" * 40, sep="\n")
problem_file = os.path.join(temp_dir, "problem.txt")

# The UTF-8 file holds non-ASCII letters, so every ASCII read below has to
# fall back on its `errors=` handler.
with open(problem_file, mode='w', encoding='utf-8') as writer:
    writer.write("Special chars: é ñ ü")

# Strategies 1-3, in order: drop the undecodable bytes, substitute a
# replacement marker, or show them as backslash escapes.
for label, handler in (
    ("Ignore errors", 'ignore'),
    ("Replace errors", 'replace'),
    ("Backslash replace", 'backslashreplace'),
):
    with open(problem_file, encoding='ascii', errors=handler) as reader:
        content = reader.read()
    print(f"  {label}: '{content}'")

# Example 5: Common encodings
# Writes the same short string with several codecs and prints the resulting
# file size for each, so the byte counts can be compared directly.
print("\n5. Testing Common Encodings")
print("-" * 40)
encodings_to_test = ['utf-8', 'utf-16', 'utf-32', 'ascii']

# Loop-invariant sample. It has its own name so the `text` value from the
# earlier examples is not overwritten.
sample_text = "Hello World"

for encoding in encodings_to_test:
    # Use a loop-local name: rebinding `test_file` here would overwrite the
    # Example 3 path, and the final cleanup would then never delete
    # test_encoding.txt.
    sample_path = os.path.join(temp_dir, f"{encoding}_test.txt")

    try:
        with open(sample_path, 'w', encoding=encoding) as f:
            f.write(sample_text)

        file_size = os.path.getsize(sample_path)
        print(f"  {encoding:10s}: {file_size:3d} bytes")

        # Each sample file is removed as soon as it has been measured.
        os.remove(sample_path)
    except Exception as e:
        # Report the failure for this encoding and continue with the next one.
        print(f"  {encoding:10s}: Error - {e}")

# Example 6: Reading with unknown encoding
print("\n6. Detecting File Content", "-" * 40, sep="\n")
unknown_file = os.path.join(temp_dir, "unknown.txt")

with open(unknown_file, mode='w', encoding='utf-8') as writer:
    writer.write("This is a test file")

# Binary mode returns the stored bytes undecoded, so they can be inspected
# before a codec is chosen.
with open(unknown_file, mode='rb') as raw_reader:
    raw_bytes = raw_reader.read(20)
decoded_head = raw_bytes.decode('utf-8')
print(f"  First 20 bytes: {raw_bytes}")
print(f"  As UTF-8: {decoded_head}")

# Example 7: Converting between encodings
print("\n7. Converting File Encoding", "-" * 40, sep="\n")
source_file = os.path.join(temp_dir, "source_utf8.txt")
dest_file = os.path.join(temp_dir, "dest_latin1.txt")

# Seed the source file with UTF-8 encoded text.
with open(source_file, mode='w', encoding='utf-8') as seed:
    seed.write("Café résumé")

# Decode from UTF-8 on the way in, then encode as Latin-1 on the way out.
with open(source_file, encoding='utf-8') as src:
    content = src.read()
with open(dest_file, mode='w', encoding='latin-1') as dst:
    dst.write(content)

print("  Converted UTF-8 → Latin-1")

# Verify by reading the destination back with the target codec.
with open(dest_file, encoding='latin-1') as check:
    converted = check.read()
print(f"  Result: {converted}")

# Cleanup: delete whichever of the demo files are still on disk.
demo_files = (
    utf8_file, latin1_file, test_file, problem_file,
    unknown_file, source_file, dest_file,
)
# filter() is lazy, so each path is checked right before it is removed.
for stale_path in filter(os.path.exists, demo_files):
    os.remove(stale_path)

# Closing summary: a rule, the heading, one bullet per takeaway, a rule.
print("\n" + "=" * 60)
print("Key Points:")
for key_point in (
    "Always specify encoding explicitly",
    "UTF-8 is the modern standard",
    "Use errors='ignore'/'replace' for robustness",
    "Encoding affects file size",
    "Mismatched encoding causes errors",
):
    print(f"  - {key_point}")
print("=" * 60)