WEB SCRAPING BASICS - Extracting data from HTML
Python
#!/usr/bin/env python3
"""WEB SCRAPING BASICS - Extracting data from HTML"""
import urllib.request
import re
# Upper bound (seconds) on the HTTP request. Without an explicit timeout
# urlopen() falls back to the global socket default, which normally means
# "block forever" if the server stalls.
FETCH_TIMEOUT_SECONDS = 10

# Compiled once at module load. The flags let the title pattern match
# upper-case tags, tags carrying attributes, and titles that span lines.
TITLE_PATTERN = re.compile(r'<title[^>]*>(.*?)</title>', re.IGNORECASE | re.DOTALL)
LINK_PATTERN = re.compile(r'href=["\']([^"\']+)["\']', re.IGNORECASE)


def extract_title(html):
    """Return the text of the first <title> element in *html*.

    Surrounding whitespace is stripped. Returns None when the document
    has no <title> element.
    """
    match = TITLE_PATTERN.search(html)
    return match.group(1).strip() if match else None


def extract_links(html):
    """Return every quoted href attribute value in *html*, in document order.

    NOTE: this is a regex heuristic suitable for a demo, not a real HTML
    parser; it matches any quoted href, whichever tag it belongs to.
    """
    return LINK_PATTERN.findall(html)


def fetch_html(url, timeout=FETCH_TIMEOUT_SECONDS):
    """Download *url* and return its body decoded to str.

    The charset announced in the Content-Type header is used when present,
    falling back to UTF-8. Undecodable bytes are replaced rather than
    aborting the whole fetch.

    Raises whatever urlopen() raises on failure (e.g. URLError, a timeout),
    or LookupError if the server announces an unknown charset.
    """
    with urllib.request.urlopen(url, timeout=timeout) as response:
        charset = response.headers.get_content_charset() or 'utf-8'
        return response.read().decode(charset, errors='replace')


def main():
    """Fetch the demo page and print its title and the first three links."""
    print("Web Scraping Basics:")
    url = "http://httpbin.org/html"
    try:
        html = fetch_html(url)
    except Exception as e:
        # Top-level boundary of a best-effort demo: report any fetch or
        # decode failure and stop instead of crashing with a traceback.
        print(f" Error: {e}")
        return

    # Extract title
    title = extract_title(html)
    if title is not None:
        print(f" Title: {title}")

    # Extract all links, but only show the first few
    links = extract_links(html)
    print(f" Found {len(links)} links")
    for link in links[:3]:
        print(f" {link}")


if __name__ == "__main__":
    main()