SIMPLE WEB CRAWLER - Following links

Python
#!/usr/bin/env python3
"""SIMPLE WEB CRAWLER - Following links"""
import urllib.request
import urllib.parse
import re
print("Simple Web Crawler:")
def get_links(url):
    """Fetch a page and return every href value found in its HTML."""
    try:
        with urllib.request.urlopen(url, timeout=5) as response:
            html = response.read().decode('utf-8', errors='ignore')
            # Naive extraction: grab whatever sits inside href="..." or href='...'
            return re.findall(r'href=["\']([^"\']+)["\']', html)
    except Exception:
        # A bare "except" would also swallow KeyboardInterrupt; Exception is enough here
        return []
start_url = "http://httpbin.org/links/5"
print(f"  Starting at: {start_url}")
links = get_links(start_url)
print(f"  Found {len(links)} links:")
for i, link in enumerate(links[:10], 1):
    # httpbin returns relative hrefs (e.g. "/links/5/0"), so resolve them against the page URL
    absolute = urllib.parse.urljoin(start_url, link)
    print(f"    {i}. {absolute}")