Feat: The crawler runs on Docker

Namu
2025-11-16 01:39:14 +01:00
commit 37f75a90e5
12 changed files with 387 additions and 0 deletions


@@ -0,0 +1,28 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy
from scrapy.linkextractors import LinkExtractor


class CrawlerSpider(scrapy.Spider):
    name = "crawler_spider"
    start_urls = ["https://quotes.toscrape.com"]

    # URL filtering rules; note that parse() below follows links manually via CSS selectors.
    link_extractor = LinkExtractor(
        deny_extensions=["pdf", "jpg", "png", "gif"],  # Skip binary files
        canonicalize=True,  # Normalize URLs
        unique=True,  # Avoid duplicates
    )

    def parse(self, response):
        yield {
            "url": response.url,
            "title": response.css("title::text").get(),
            "content": " ".join(
                response.css("p::text, h1::text, h2::text, h3::text, h4::text, h5::text, a::text").getall()
            ),
            "links": response.css("a::attr(href)").getall(),
        }
        # Follow internal links only: relative paths or URLs containing the start URL.
        for href in response.css("a::attr(href)").getall():
            if href.startswith("/") or self.start_urls[0] in href:
                yield response.follow(href, callback=self.parse)
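
For reference, a minimal sketch of how this spider could be launched programmatically, assuming a standard Scrapy project layout; the module path crawler.spiders.crawler_spider is hypothetical, and the FEEDS setting simply writes the scraped items to a local JSON file:

from scrapy.crawler import CrawlerProcess

# Hypothetical import path; adjust to the project's actual package layout.
from crawler.spiders.crawler_spider import CrawlerSpider

process = CrawlerProcess(settings={
    "FEEDS": {"output.json": {"format": "json"}},  # export scraped items as JSON
})
process.crawl(CrawlerSpider)
process.start()  # blocks until the crawl finishes

Inside a container the spider would more typically be started with the standard CLI entry point, scrapy crawl crawler_spider; the Docker configuration itself is among the other changed files not shown here.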