# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

import scrapy
from scrapy.linkextractors import LinkExtractor


class CrawlerSpider(scrapy.Spider):
    name = "crawler_spider"
    start_urls = ["https://quotes.toscrape.com"]
    # Restrict crawling to the start domain; OffsiteMiddleware drops other requests.
    allowed_domains = ["quotes.toscrape.com"]

    link_extractor = LinkExtractor(
        deny_extensions=["pdf", "jpg", "png", "gif"],  # skip binary files
        canonicalize=True,  # normalize URLs
        unique=True,  # drop duplicate links
    )

    def parse(self, response):
        yield {
            "url": response.url,
            "title": response.css("title::text").get(),
            # ::text must be applied to each selector; in the original it only
            # applied to the trailing "a", so the other tags matched whole elements.
            "content": " ".join(
                response.css(
                    "p::text, h1::text, h2::text, h3::text, h4::text, h5::text, a::text"
                ).getall()
            ),
            "links": response.css("a::attr(href)").getall(),
        }
        # Follow links through the configured extractor instead of re-checking
        # URL prefixes by hand; allowed_domains keeps the crawl on-site.
        for link in self.link_extractor.extract_links(response):
            yield response.follow(link.url, callback=self.parse)
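

# Usage sketch (an assumption, not part of the original project): the spider is
# normally run from a Scrapy project with `scrapy crawl crawler_spider -O quotes.json`,
# but it can also be driven programmatically as below. The "quotes.json" feed
# path is a hypothetical example.
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(
        settings={"FEEDS": {"quotes.json": {"format": "json"}}}  # export items as JSON
    )
    process.crawl(CrawlerSpider)
    process.start()  # blocks until the crawl finishes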