# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

import scrapy

from scrapy.linkextractors import LinkExtractor

class CrawlerSpider(scrapy.Spider):
    name = "crawler_spider"
    start_urls = ["https://quotes.toscrape.com"]

    # Extract only crawlable, same-site links. allow_domains mirrors the
    # same-domain check the spider applies when following links.
    link_extractor = LinkExtractor(
        allow_domains=["quotes.toscrape.com"],
        deny_extensions=["pdf", "jpg", "png", "gif"],  # skip binary files
        canonicalize=True,  # normalize URLs
        unique=True,  # drop duplicate links
    )
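
    # A minimal throttling sketch; these settings are assumptions, not part of
    # the original spider. custom_settings is Scrapy's standard hook for
    # per-spider configuration and is worth setting for a broad crawler.
    custom_settings = {
        "DOWNLOAD_DELAY": 0.5,   # pause between requests to the same site
        "ROBOTSTXT_OBEY": True,  # respect robots.txt
    }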

    def parse(self, response):
        # Emit one record per page: URL, title, visible text, and raw links.
        # ::text must be applied to each selector; bare element selectors
        # (p, h1, ...) would otherwise return full HTML instead of text.
        yield {
            "url": response.url,
            "title": response.css("title::text").get(),
            "content": " ".join(
                response.css(
                    "p::text, h1::text, h2::text, h3::text, h4::text, h5::text, a::text"
                ).getall()
            ),
            "links": response.css("a::attr(href)").getall(),
        }

        # Follow links through the LinkExtractor configured above rather than
        # re-filtering raw hrefs by hand; extract_links() already applies the
        # domain, extension, and deduplication rules.
        for link in self.link_extractor.extract_links(response):
            yield response.follow(link.url, callback=self.parse)
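
# A usage sketch, assuming this spider lives in a standard Scrapy project
# (crawler_spider is the name defined above; quotes.json is an arbitrary
# output path). Run from the project root:
#
#   scrapy crawl crawler_spider -O quotes.json
#
# -O overwrites the output file; use -o instead to append to an existing one.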