crawler2.0/crawler2/spiders/__init__.py

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy
from scrapy.linkextractors import LinkExtractor


class CrawlerSpider(scrapy.Spider):
    name = "crawler_spider"
    allowed_domains = ["quotes.toscrape.com"]  # offsite middleware keeps the crawl on this domain
    start_urls = ["https://quotes.toscrape.com"]

    link_extractor = LinkExtractor(
        deny_extensions=["pdf", "jpg", "png", "gif"],  # skip binary files
        canonicalize=True,  # normalize URLs
        unique=True,  # avoid duplicate links
    )

    def parse(self, response):
        yield {
            "url": response.url,
            "title": response.css("title::text").get(),
            "content": " ".join(
                response.css(
                    "p::text, h1::text, h2::text, h3::text, h4::text, h5::text, "
                    "a::text, span::text, div::text"
                ).getall()
            ),
            "links": response.css("a::attr(href)").getall(),
        }
        # Follow links through the configured LinkExtractor rather than
        # re-filtering raw hrefs by hand, so the deny/canonicalize/unique
        # rules above actually apply.
        for link in self.link_extractor.extract_links(response):
            yield response.follow(link.url, callback=self.parse)
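
The spider can be run with the standard CLI (scrapy crawl crawler_spider -O items.json from the project root). Below is a minimal standalone sketch using Scrapy's CrawlerProcess instead; the script name, the feed file name, and the DOWNLOAD_DELAY value are illustrative assumptions, not part of the original project.

# run_spider.py -- hypothetical helper, assumes the package layout shown above.
from scrapy.crawler import CrawlerProcess

from crawler2.spiders import CrawlerSpider

process = CrawlerProcess(
    settings={
        # Export scraped items to a JSON Lines file via Scrapy's feed exports.
        "FEEDS": {"items.jl": {"format": "jsonlines"}},
        # Throttle requests a little to be polite to the target site.
        "DOWNLOAD_DELAY": 0.5,
    }
)
process.crawl(CrawlerSpider)
process.start()  # blocks until the crawl finishes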