crawler2.0/crawler2/spiders/__init__.py

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy
from scrapy.linkextractors import LinkExtractor


class CrawlerSpider(scrapy.Spider):
    name = "crawler_spider"
    allowed_domains = ["quotes.toscrape.com"]  # offsite middleware keeps the crawl on this domain
    start_urls = ["https://quotes.toscrape.com"]

    link_extractor = LinkExtractor(
        deny_extensions=["pdf", "jpg", "png", "gif"],  # skip binary files
        canonicalize=True,  # normalize URLs
        unique=True,  # avoid duplicate links
    )

    def parse(self, response):
        yield {
            "url": response.url,
            "title": response.css("title::text").get(),
            "content": " ".join(
                response.css(
                    "p::text, h1::text, h2::text, h3::text, h4::text, h5::text, "
                    "a::text, span::text, div::text"
                ).getall()
            ),
            "links": response.css("a::attr(href)").getall(),
        }
        # Follow links through the configured LinkExtractor rather than
        # re-filtering raw hrefs by hand, so the deny/canonicalize/unique
        # rules above actually apply.
        for link in self.link_extractor.extract_links(response):
            yield response.follow(link.url, callback=self.parse)
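
The spider can be run with the standard CLI (scrapy crawl crawler_spider -O items.json from the project root). Below is a minimal standalone sketch using Scrapy's CrawlerProcess instead; the script name, the feed file name, and the DOWNLOAD_DELAY value are illustrative assumptions, not part of the original project.

# run_spider.py -- hypothetical helper, assumes the package layout shown above.
from scrapy.crawler import CrawlerProcess

from crawler2.spiders import CrawlerSpider

process = CrawlerProcess(
    settings={
        # Export scraped items to a JSON Lines file via Scrapy's feed exports.
        "FEEDS": {"items.jl": {"format": "jsonlines"}},
        # Throttle requests a little to be polite to the target site.
        "DOWNLOAD_DELAY": 0.5,
    }
)
process.crawl(CrawlerSpider)
process.start()  # blocks until the crawl finishes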