Skip to content

WEB SCRAPING BASICS - Extracting data from HTML

Python
#!/usr/bin/env python3
"""WEB SCRAPING BASICS - Extracting data from HTML"""
import urllib.request
import re
# Demo target page; kept as a module-level name so it is easy to change.
url = "http://httpbin.org/html"

# Upper bound (seconds) for the HTTP request so a stalled server cannot hang
# the script forever -- urlopen() has no timeout by default.
FETCH_TIMEOUT_SECONDS = 10

# NOTE: regular expressions are only a rough approximation of HTML parsing;
# they are kept here because this script is a "basics" demo.  For real pages
# prefer html.parser.HTMLParser from the standard library.

# <title> may be upper-case, carry attributes, and span several lines.
TITLE_RE = re.compile(
    r'<title\b[^>]*>(.*?)</title\s*>', re.IGNORECASE | re.DOTALL
)

# The closing quote must match the opening quote, so each quote style gets its
# own alternative.  Whitespace around "=" and upper-case HREF are accepted.
LINK_RE = re.compile(
    r'href\s*=\s*(?:"([^"]+)"|\'([^\']+)\')', re.IGNORECASE
)


def fetch_html(page_url, timeout=FETCH_TIMEOUT_SECONDS):
    """Download *page_url* and return its body as text.

    The charset announced in the Content-Type header is used when present,
    falling back to UTF-8.  Undecodable bytes are replaced rather than
    aborting the whole scrape.

    Raises whatever urllib raises on network/HTTP failure (for example
    urllib.error.URLError or a socket timeout), or LookupError if the server
    announces an unknown charset.
    """
    with urllib.request.urlopen(page_url, timeout=timeout) as response:
        charset = response.headers.get_content_charset() or "utf-8"
        return response.read().decode(charset, errors="replace")


def extract_title(html):
    """Return the text of the first <title> element, or None if absent.

    Runs of whitespace (including newlines) are collapsed to single spaces so
    a title that is split across lines prints on one line.
    """
    match = TITLE_RE.search(html)
    if match is None:
        return None
    return " ".join(match.group(1).split())


def extract_links(html):
    """Return every non-empty quoted href value in *html*, in document order."""
    # findall() yields (double_quoted, single_quoted) pairs; exactly one of
    # the two groups is non-empty for each match.
    return [double or single for double, single in LINK_RE.findall(html)]


def main():
    """Fetch the demo page and print its title and the first three links."""
    print("Web Scraping Basics:")
    try:
        html = fetch_html(url)
    except Exception as e:
        # Top-level boundary of a demo script: report the failure and stop
        # instead of showing a traceback.
        print(f"  Error: {e}")
        return

    title = extract_title(html)
    if title is not None:
        print(f"  Title: {title}")

    links = extract_links(html)
    print(f"  Found {len(links)} links")
    for link in links[:3]:
        print(f"    {link}")


if __name__ == "__main__":
    main()