In this post I'm sharing a Python script that automatically collects the text content of a website.
Make sure you have BeautifulSoup and requests installed first.
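If they aren't installed yet, something like this usually does it (assuming a standard pip setup):

pip install requests beautifulsoup4

Once that's done, run the Python script below: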
import requests
from bs4 import BeautifulSoup
import os
import re
from urllib.parse import urljoin, urlparse
import time
from datetime import datetime
from typing import Dict, Optional, Set
import hashlib
class AdvancedWebsiteScraper:
def __init__(self, base_url):
self.base_url = base_url
self.domain = urlparse(base_url).netloc
self.session = requests.Session()
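        # Use a browser-like User-Agent and a multilingual Accept-Language header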
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
'Accept-Language': 'id,en-US;q=0.9,en;q=0.8,ar;q=0.7,fa;q=0.6'
})
        # Data structures for URL management
self.visited_urls: Set[str] = set()
self.urls_to_visit: Set[str] = {base_url}
self.post_urls: Set[str] = set()
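        # URL patterns to skip: media files, uploads, feeds, WP endpoints, fragments, tag/category archives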
self.avoid_patterns = [
r'\.(pdf|jpg|jpeg|png|gif|zip|rar|mp4|mp3|avi)$',
r'wp-content/uploads/',
r'feed/?$',
r'wp-json/',
r'\.php(\?|$)',
r'#.*$',
r'tag/',
r'category/'
]
# Stats
self.stats = {
'total_pages': 0,
'posts_scraped': 0,
'failed_pages': 0,
'start_time': None,
'end_time': None
}
def is_same_domain(self, url: str) -> bool:
"""Cek apakah URL masih dalam domain yang sama"""
try:
parsed = urlparse(url)
return parsed.netloc == self.domain or parsed.netloc == ''
        except Exception:
return False
def should_avoid(self, url: str) -> bool:
"""Cek apakah URL harus dihindari"""
for pattern in self.avoid_patterns:
if re.search(pattern, url, re.IGNORECASE):
return True
return False
def normalize_url(self, url: str) -> str:
"""Normalisasi URL untuk menghindari duplikat"""
# Hapus fragment, trailing slash, dan normalize
parsed = urlparse(url)
normalized = f"{parsed.scheme}://{parsed.netloc}{parsed.path.rstrip('/')}"
if parsed.query:
            # Sort query parameters for consistency
params = sorted(parsed.query.split('&'))
normalized += '?' + '&'.join(params)
return normalized
def extract_links(self, soup: BeautifulSoup, current_url: str) -> Set[str]:
"""Ekstrak semua link dari halaman"""
links = set()
for link in soup.find_all('a', href=True):
href = link['href']
# Skip empty, javascript, mailto, etc
if not href or href.startswith(('#', 'javascript:', 'mailto:', 'tel:')):
continue
            # Build an absolute URL
absolute_url = urljoin(current_url, href)
normalized_url = self.normalize_url(absolute_url)
            # Filter by domain and avoid-patterns
if (self.is_same_domain(normalized_url) and
not self.should_avoid(normalized_url)):
links.add(normalized_url)
return links
def is_article_page(self, soup: BeautifulSoup, url: str) -> bool:
"""Deteksi apakah halaman adalah artikel/post"""
# Cek berdasarkan URL pattern
url_lower = url.lower()
if any(pattern in url_lower for pattern in ['/page/', '/search', '/?s=']):
return False
        # Check by HTML structure
article_indicators = [
len(soup.find_all('article')) > 0,
soup.find('article') is not None,
soup.select_one('.post, .article, .entry-content, .post-content') is not None,
soup.find('main') is not None,
            len(soup.find_all(['h1', 'h2'])) >= 2  # Articles usually have several headings
]
        # Check for a time element (post date)
date_indicators = soup.select('.post-date, .entry-date, time, .date, [itemprop="datePublished"]')
return any(article_indicators) or len(date_indicators) > 0
    def get_soup(self, url: str, max_retries: int = 3) -> Optional[BeautifulSoup]:
        """Fetch and parse a web page, with retries"""
for attempt in range(max_retries):
try:
response = self.session.get(url, timeout=15)
response.encoding = 'utf-8'
response.raise_for_status()
                # Check the content type
content_type = response.headers.get('content-type', '').lower()
if 'text/html' not in content_type:
print(f" ⓘ Bukan HTML: {content_type}")
return None
return BeautifulSoup(response.text, 'html.parser')
except requests.exceptions.RequestException as e:
print(f" ⚠ Attempt {attempt + 1}/{max_retries} failed for {url}: {e}")
if attempt < max_retries - 1:
time.sleep(2 ** attempt) # Exponential backoff
else:
self.stats['failed_pages'] += 1
return None
except Exception as e:
print(f" ✗ Error parsing {url}: {e}")
self.stats['failed_pages'] += 1
return None
def extract_main_content(self, soup: BeautifulSoup, url: str) -> Dict:
"""Ekstrak konten utama dari halaman artikel"""
content = {
'title': '',
'body': '',
'date': '',
'url': url,
'hash': ''
}
        # 1. Extract the title using multiple strategies
title_selectors = [
'h1.entry-title', 'h1.post-title', 'h1.title',
'article h1', 'main h1', '.post-header h1',
'h1', 'title'
]
for selector in title_selectors:
title_elem = soup.select_one(selector)
if title_elem and title_elem.get_text(strip=True):
content['title'] = title_elem.get_text(strip=True)
break
        # Fall back to the meta title
if not content['title'] and soup.title:
content['title'] = soup.title.get_text(strip=True)
        # 2. Extract the content with a hierarchical approach
content_selectors = [
('article', 10), # Highest priority
('.post-content', 9),
('.entry-content', 9),
('.article-content', 9),
('main', 8),
('.post', 8),
('#content', 7),
('.content', 6),
('body', 5) # Last resort
]
main_content = None
main_score = 0
for selector, score in content_selectors:
elements = soup.select(selector) if selector != 'body' else [soup.body]
for elem in elements:
if elem:
                    # Score by content length
text_length = len(elem.get_text(strip=True))
link_density = len(elem.find_all('a')) / max(1, len(elem.find_all(['p', 'div'])))
                    # Adjust the score: longer text and fewer links = better
adjusted_score = score + (text_length / 1000) - (link_density * 5)
if adjusted_score > main_score and text_length > 200:
main_content = elem
main_score = adjusted_score
if main_content:
            # Remove unwanted elements
unwanted_selectors = [
'script', 'style', 'iframe', 'nav',
'.sidebar', '.widget', '.comments',
'.related-posts', '.share-buttons',
'footer', '.footer', 'header', '.header',
'.navigation', '.pagination', '.breadcrumb'
]
for selector in unwanted_selectors:
for elem in main_content.select(selector):
elem.decompose()
            # Extract structured text
text_parts = []
            # Prioritize headings and paragraphs
for elem in main_content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li', 'blockquote']):
text = elem.get_text(' ', strip=True)
                if text and len(text) > 10:  # At least 10 characters
# Format heading
if elem.name.startswith('h'):
level = int(elem.name[1]) if len(elem.name) > 1 else 1
text_parts.append(f"\n{'#' * level} {text}\n")
else:
text_parts.append(text)
            # Fallback: grab all text if the structure is unclear
if len(text_parts) < 3:
all_text = main_content.get_text(' ', strip=True)
                # Split into paragraphs
paragraphs = re.split(r'\n\s*\n', all_text)
text_parts = [p for p in paragraphs if len(p.strip()) > 30]
content['body'] = '\n\n'.join(text_parts)
# 3. Extract date
date_selectors = [
'time[datetime]',
'.post-date',
'.entry-date',
'.date',
'[itemprop="datePublished"]',
'meta[property="article:published_time"]'
]
for selector in date_selectors:
elem = soup.select_one(selector)
if elem:
date_value = elem.get('datetime') or elem.get('content') or elem.get_text(strip=True)
if date_value:
                    content['date'] = date_value[:50]  # Truncate if too long
break
if not content['date']:
content['date'] = datetime.now().strftime("%Y-%m-%d")
        # 4. Generate a hash for duplicate detection
content_str = f"{content['title']}{content['body'][:500]}"
content['hash'] = hashlib.md5(content_str.encode('utf-8')).hexdigest()[:16]
return content
def save_as_text(self, content: Dict, folder: str = 'scraped_posts') -> bool:
"""Simpan konten sebagai file teks dengan encoding UTF-8"""
os.makedirs(folder, exist_ok=True)
        # Clean the filename for multilingual support
title = content['title'] or 'untitled'
filename = re.sub(r'[<>:"/\\|?*]', '', title)
filename = re.sub(r'\s+', '_', filename)
        filename = filename[:80] + '.txt'  # Limit the length
filepath = os.path.join(folder, filename)
        # Format the content
text_content = f"""=== METADATA ===
Title: {content['title']}
Date: {content['date']}
URL: {content['url']}
Hash: {content['hash']}
Scraped: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
=== CONTENT ===
{content['body']}
=== END ===
"""
try:
            # Check for duplicates by hash
hash_file = os.path.join(folder, 'content_hashes.txt')
existing_hashes = set()
if os.path.exists(hash_file):
with open(hash_file, 'r', encoding='utf-8') as f:
existing_hashes = set(line.strip() for line in f)
if content['hash'] in existing_hashes:
print(f" ⓘ Duplikat ditemukan: {filename}")
return False
            # Save the content
with open(filepath, 'w', encoding='utf-8') as f:
f.write(text_content)
# Update hash file
with open(hash_file, 'a', encoding='utf-8') as f:
f.write(content['hash'] + '\n')
print(f" ✓ Disimpan: {filename}")
return True
except Exception as e:
print(f" ✗ Error menyimpan {filename}: {e}")
return False
def crawl_page(self, url: str) -> bool:
"""Crawl satu halaman secara rekursif"""
if url in self.visited_urls:
return False
print(f"\n{'='*60}")
print(f"Halaman {self.stats['total_pages'] + 1}: {url}")
print(f"{'='*60}")
self.visited_urls.add(url)
self.stats['total_pages'] += 1
soup = self.get_soup(url)
if not soup:
return False
        # Detect the page type
if self.is_article_page(soup, url):
print(f" 📄 Deteksi: Artikel")
content = self.extract_main_content(soup, url)
if content and content['body'] and len(content['body']) > 100:
if self.save_as_text(content):
self.post_urls.add(url)
self.stats['posts_scraped'] += 1
else:
print(f" ⚠ Konten terlalu pendek atau tidak valid")
else:
print(f" 🔍 Deteksi: Halaman indeks/pagination")
        # Extract links to keep crawling
new_links = self.extract_links(soup, url)
# Filter out visited URLs
new_links = new_links - self.visited_urls - self.urls_to_visit
        # Prioritize URLs that look like articles
article_urls = []
other_urls = []
for link in new_links:
if any(pattern in link.lower() for pattern in ['/20', '/post/', '/article/', '/?p=']):
article_urls.append(link)
else:
other_urls.append(link)
        # Add to the queue (note: urls_to_visit is a set, so "articles first" is not a strict order)
self.urls_to_visit.update(article_urls)
self.urls_to_visit.update(other_urls)
print(f" 📊 Ditemukan: {len(new_links)} link baru")
print(f" 📋 Queue: {len(self.urls_to_visit)} URL menunggu")
return True
def run(self, max_pages: int = 100, max_posts: int = 50):
"""Jalankan scraping rekursif"""
print("=" * 70)
print("ADVANCED RECURSIVE WEBSITE SCRAPER")
print(f"Target: {self.base_url}")
print(f"Domain: {self.domain}")
print("=" * 70)
self.stats['start_time'] = datetime.now()
try:
while (self.urls_to_visit and
self.stats['total_pages'] < max_pages and
self.stats['posts_scraped'] < max_posts):
                # Get the next URL from the queue
                current_url = self.urls_to_visit.pop()
                # Crawl the page
                self.crawl_page(current_url)
                # Delay to be polite to the server
                time.sleep(1)
# Progress update
if self.stats['total_pages'] % 10 == 0:
print(f"\n📊 Progress: {self.stats['total_pages']} halaman, "
f"{self.stats['posts_scraped']} artikel")
except KeyboardInterrupt:
print("\n⚠ Scraping dihentikan oleh user")
except Exception as e:
print(f"\n✗ Error dalam proses scraping: {e}")
self.stats['end_time'] = datetime.now()
self.generate_report()
def generate_report(self):
"""Generate laporan akhir"""
duration = self.stats['end_time'] - self.stats['start_time']
print("\n" + "=" * 70)
print("SCRAPING COMPLETE - FINAL REPORT")
print("=" * 70)
print(f"Domain: {self.domain}")
print(f"Waktu: {duration}")
print(f"Halaman dikunjungi: {self.stats['total_pages']}")
print(f"Artikel berhasil: {self.stats['posts_scraped']}")
print(f"Halaman gagal: {self.stats['failed_pages']}")
print(f"URL unik ditemukan: {len(self.visited_urls)}")
print(f"URL artikel ditemukan: {len(self.post_urls)}")
print("-" * 70)
print(f"Folder output: scraped_posts/")
print(f"Hash file: scraped_posts/content_hashes.txt")
print("=" * 70)
        # Save the list of successfully scraped URLs
if self.post_urls:
url_list_file = os.path.join('scraped_posts', 'scraped_urls.txt')
with open(url_list_file, 'w', encoding='utf-8') as f:
for url in sorted(self.post_urls):
f.write(url + '\n')
print(f"URL list saved: {url_list_file}")
# Main execution with configuration
if __name__ == "__main__":
TARGET_URL = "https://websiteabc.com/"
    # Create the scraper
scraper = AdvancedWebsiteScraper(TARGET_URL)
    # Scraping configuration
    MAX_PAGES = 3000  # Maximum number of pages to visit
    MAX_POSTS = 3000  # Maximum number of articles to scrape
print(f"Konfigurasi:")
print(f" • Max halaman: {MAX_PAGES}")
print(f" • Max artikel: {MAX_POSTS}")
print(f" • Start URL: {TARGET_URL}")
    # Run the scraper
scraper.run(max_pages=MAX_PAGES, max_posts=MAX_POSTS)
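That's the whole script. It starts from TARGET_URL, follows internal links on the same domain, detects article pages, and saves each one as a UTF-8 text file in scraped_posts/, skipping duplicates via the hash file. If you'd rather import the class from another file instead of editing the __main__ block, a minimal sketch looks like this (assuming the script above is saved as scraper.py; the URL and limits are just placeholders):

from scraper import AdvancedWebsiteScraper  # assumes the script above was saved as scraper.py

scraper = AdvancedWebsiteScraper("https://example.com/")
scraper.run(max_pages=100, max_posts=50)  # keep the limits small for a first test run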