Skip to content

WORKING WITH LINKS - Extracting all links

Python
#!/usr/bin/env python3
"""WORKING WITH LINKS - Extracting all links"""
from bs4 import BeautifulSoup
# Sample document covering the common href shapes: site-relative paths,
# an absolute URL, and a non-HTTP scheme (mailto:).
html = """
<html><body>
<a href="/page1">Page 1</a>
<a href="https://example.com">External</a>
<a href="/page2" class="internal">Page 2</a>
<a href="mailto:test@example.com">Email</a>
</body></html>
"""
soup = BeautifulSoup(html, 'html.parser')

print("Working with Links:")

# Collect every <a> element once; both reports below reuse this list
# instead of walking the parse tree a second time.
links = soup.find_all('a')
print(f"  Total links: {len(links)}")

for link in links:
    href = link.get('href')  # None when the anchor has no href attribute
    # get_text() joins all descendant text, so anchors wrapping other tags
    # (e.g. <a><b>x</b> y</a>) still yield a label; .string would be None.
    text = link.get_text(strip=True)
    print(f"    {text}: {href}")

# Internal links are site-relative paths starting with a single "/".
# "//host/path" is protocol-relative and points at another host, so it is
# excluded. Anchors without an href fall back to "" and never match.
internal = [
    href
    for href in (a.get('href', '') for a in links)
    if href.startswith('/') and not href.startswith('//')
]
print(f"  Internal links: {internal}")