Feat: The crawler works on Docker
28
crawler2/spiders/__init__.py
Normal file
@@ -0,0 +1,28 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy
from scrapy.linkextractors import LinkExtractor


class CrawlerSpider(scrapy.Spider):
    name = "crawler_spider"
    start_urls = ["https://quotes.toscrape.com"]
    link_extractor = LinkExtractor(
        deny_extensions=["pdf", "jpg", "png", "gif"],  # Skip file downloads
        canonicalize=True,  # Normalize URLs
        unique=True,  # Avoid duplicate links
    )

    def parse(self, response):
        # Emit one item per page: its URL, title, visible text, and outgoing links.
        yield {
            "url": response.url,
            "title": response.css("title::text").get(),
            "content": " ".join(
                response.css(
                    "p::text, h1::text, h2::text, h3::text, h4::text, h5::text, a::text"
                ).getall()
            ),
            "links": response.css("a::attr(href)").getall(),
        }

        # Follow relative links and absolute links that point back to the start domain.
        for href in response.css("a::attr(href)").getall():
            if href.startswith("/") or self.start_urls[0] in href:
                yield response.follow(href, callback=self.parse)
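The commit message says the crawler runs on Docker, but no entrypoint or Dockerfile appears in this diff. The following is a minimal sketch of one plausible way to launch the spider as a container entrypoint; the script name run_crawler.py, the import path crawler2.spiders, and the output.json feed location are assumptions and not part of this commit.

# run_crawler.py -- hypothetical entrypoint, not included in this commit
from scrapy.crawler import CrawlerProcess

from crawler2.spiders import CrawlerSpider  # import path assumed from the diff above

process = CrawlerProcess(settings={
    # Write scraped items to a JSON feed; the file name is an assumption.
    "FEEDS": {"output.json": {"format": "json"}},
})
process.crawl(CrawlerSpider)
process.start()  # blocks until the crawl finishes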