Feat: The crawler works on Docker
28
crawler2/spiders/__init__.py
Normal file
@@ -0,0 +1,28 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy
from scrapy.linkextractors import LinkExtractor


class CrawlerSpider(scrapy.Spider):
    name = "crawler_spider"
    start_urls = ["https://quotes.toscrape.com"]
    link_extractor = LinkExtractor(
        deny_extensions=["pdf", "jpg", "png", "gif"],  # Skip file downloads
        canonicalize=True,  # Normalize URLs
        unique=True,  # Avoid duplicate links
    )

    def parse(self, response):
        # Emit one item per page: its URL, title, visible text, and outgoing links.
        yield {
            "url": response.url,
            "title": response.css("title::text").get(),
            "content": " ".join(
                response.css(
                    "p::text, h1::text, h2::text, h3::text, h4::text, h5::text, a::text"
                ).getall()
            ),
            "links": response.css("a::attr(href)").getall(),
        }

        # Follow relative links and absolute links that point back to the start domain.
        for href in response.css("a::attr(href)").getall():
            if href.startswith("/") or self.start_urls[0] in href:
                yield response.follow(href, callback=self.parse)
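The commit message says the crawler runs on Docker, but no entrypoint or Dockerfile appears in this diff. The following is a minimal sketch of one plausible way to launch the spider as a container entrypoint; the script name run_crawler.py, the import path crawler2.spiders, and the output.json feed location are assumptions and not part of this commit.

# run_crawler.py -- hypothetical entrypoint, not included in this commit
from scrapy.crawler import CrawlerProcess

from crawler2.spiders import CrawlerSpider  # import path assumed from the diff above

process = CrawlerProcess(settings={
    # Write scraped items to a JSON feed; the file name is an assumption.
    "FEEDS": {"output.json": {"format": "json"}},
})
process.crawl(CrawlerSpider)
process.start()  # blocks until the crawl finishes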