commit 37f75a90e5c69d1ee79d01f49e2ebb67a6ab793b
Author: Namu
Date:   Sun Nov 16 01:39:14 2025 +0100

    Feat: The crawler works on docker

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..f10c67f
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,12 @@
+# Jetbrains
+.idea
+
+# Docker
+Dockerfile
+docker-compose.yml
+
+# Git
+.gitignore
+
+# venv
+.venv
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ad082d9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+# jetbrains
+.idea/
+
+# venv
+.venv
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..91e5808
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,18 @@
+# Use an official Python image
+FROM python:3.13-slim
+
+
+# Set the working directory inside the container
+WORKDIR /app
+
+# Copy the project dependencies
+COPY requirements.txt .
+
+# Install the dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY ./crawler2 ./crawler2
+COPY scrapy.cfg .
+
+# Command to launch the spider
+CMD ["scrapy", "crawl", "crawler_spider"]
diff --git a/crawler2/__init__.py b/crawler2/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/crawler2/items.py b/crawler2/items.py
new file mode 100644
index 0000000..e43c9d7
--- /dev/null
+++ b/crawler2/items.py
@@ -0,0 +1,12 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class Crawler2Item(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass
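
Editor's note: items.py keeps the generated placeholder and the spider (crawler2/spiders/__init__.py below) yields plain dicts with url, title, content and links. If typed items are wanted later, a minimal sketch of a matching item class could look like the following (hypothetical, not part of this commit; the class name PageItem is made up):

    # crawler2/items.py (sketch): typed item mirroring the fields the spider yields
    import scrapy


    class PageItem(scrapy.Item):
        url = scrapy.Field()
        title = scrapy.Field()
        content = scrapy.Field()
        links = scrapy.Field()
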
diff --git a/crawler2/middlewares.py b/crawler2/middlewares.py
new file mode 100644
index 0000000..522849b
--- /dev/null
+++ b/crawler2/middlewares.py
@@ -0,0 +1,100 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+
+
+class Crawler2SpiderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, or item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request or item objects.
+        pass
+
+    async def process_start(self, start):
+        # Called with an async iterator over the spider start() method or the
+        # matching method of an earlier spider middleware.
+        async for item_or_request in start:
+            yield item_or_request
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
+
+
+class Crawler2DownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either:
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
diff --git a/crawler2/pipelines.py b/crawler2/pipelines.py
new file mode 100644
index 0000000..946381f
--- /dev/null
+++ b/crawler2/pipelines.py
@@ -0,0 +1,59 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+import pymongo
+from bs4 import BeautifulSoup
+
+
+class CleanTextPipeline:
+    def process_item(self, item, spider):
+        soup = BeautifulSoup(item["content"], "html.parser")
+        # separator=" " keeps a space between text nodes so words from
+        # adjacent tags are not glued together
+        clean_text = soup.get_text(separator=" ", strip=True)
+        item["content"] = " ".join(clean_text.split())
+        return item
+
+
+class MongoDBPipeline:
+    def __init__(self, mongo_uri, mongo_db, mongo_collection, mongo_user, mongo_password):
+        self.mongo_uri = mongo_uri
+        self.mongo_db = mongo_db
+        self.mongo_collection = mongo_collection
+        self.mongo_user = mongo_user
+        self.mongo_password = mongo_password
+        self.client = None
+        self.db = None
+        self.collection = None
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(
+            mongo_uri=crawler.settings.get("MONGO_URI", "mongodb://mongodb:27017"),
+            mongo_db=crawler.settings.get("MONGO_DATABASE", "web_crawler"),
+            mongo_collection=crawler.settings.get("MONGO_COLLECTION", "pages"),
+            mongo_user=crawler.settings.get("MONGO_USER", "admin"),
+            mongo_password=crawler.settings.get("MONGO_PASSWORD", "Admin@123"),
+        )
+
+    def open_spider(self, spider):
+        # Use the URI together with the authentication credentials
+        self.client = pymongo.MongoClient(
+            self.mongo_uri,
+            username=self.mongo_user,
+            password=self.mongo_password,
+            authSource="admin"  # Authentication database
+        )
+        self.db = self.client[self.mongo_db]
+        self.collection = self.db[self.mongo_collection]
+
+    def process_item(self, item, spider):
+        if not self.collection.find_one({"url": item["url"]}):
+            self.collection.insert_one(dict(item))
+        return item
+
+    def close_spider(self, spider):
+        self.client.close()
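
Editor's note: to check what MongoDBPipeline actually stored, a small pymongo snippet along these lines should work. This is a sketch only, assuming the compose defaults (root user admin / Admin@123, authSource admin, database web_crawler, collection pages) and that port 27017 is published on the host as in docker-compose.yml:

    import pymongo

    # Connect with the same credentials the pipeline uses (compose defaults assumed)
    client = pymongo.MongoClient(
        "mongodb://localhost:27017",  # port published by docker-compose
        username="admin",
        password="Admin@123",
        authSource="admin",
    )
    pages = client["web_crawler"]["pages"]

    print("stored pages:", pages.count_documents({}))
    print("sample:", pages.find_one({}, {"_id": 0, "url": 1, "title": 1}))
    client.close()
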
diff --git a/crawler2/settings.py b/crawler2/settings.py
new file mode 100644
index 0000000..1a81759
--- /dev/null
+++ b/crawler2/settings.py
@@ -0,0 +1,105 @@
+# Scrapy settings for crawler2 project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://docs.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = "crawler2"
+
+SPIDER_MODULES = ["crawler2.spiders"]
+NEWSPIDER_MODULE = "crawler2.spiders"
+
+ADDONS = {}
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+USER_AGENT = "Mozilla/5.0 (compatible; crawler2/+thomassazeratdev@gmail.com)"
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+DOWNLOAD_DELAY = 1
+RANDOMIZE_DOWNLOAD_DELAY = True
+
+# Concurrency and throttling settings
+CONCURRENT_REQUESTS = 4
+CONCURRENT_REQUESTS_PER_DOMAIN = 2
+CONCURRENT_REQUESTS_PER_IP = 2
+
+# Error handling
+RETRY_TIMES = 3
+RETRY_HTTP_CODES = [500, 502, 503, 504, 400, 403, 404, 408]
+
+DOWNLOADER_MIDDLEWARES = {
+    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,  # Disable the default user-agent middleware
+    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,  # Enable retries
+}
+
+ITEM_PIPELINES = {
+    'crawler2.pipelines.CleanTextPipeline': 300,  # Clean the scraped data
+    'crawler2.pipelines.MongoDBPipeline': 400,  # Save to MongoDB (runs after cleaning)
+}
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+#    "Accept-Language": "en",
+#}
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    "crawler2.middlewares.Crawler2SpiderMiddleware": 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    "crawler2.middlewares.Crawler2DownloaderMiddleware": 543,
+#}
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    "scrapy.extensions.telnet.TelnetConsole": None,
+#}
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+#    "crawler2.pipelines.Crawler2Pipeline": 300,
+#}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = "httpcache"
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+
+# Set settings whose default value is deprecated to a future-proof value
+FEED_EXPORT_ENCODING = "utf-8"
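
Editor's note: docker-compose.yml (below) passes MONGO_URI, MONGO_DATABASE and MONGO_COLLECTION as container environment variables, but MongoDBPipeline reads those names from Scrapy settings, which do not pick up the process environment automatically; the hard-coded defaults in the pipeline are therefore what actually apply (for example the database ends up as web_crawler, not the web_pages named in the compose file). If the compose values are meant to take effect, a minimal sketch of how settings.py could forward the environment (an assumption, not part of this commit):

    # Sketch for crawler2/settings.py: forward the container environment to the pipeline.
    # The fallback values mirror the defaults already hard-coded in MongoDBPipeline.
    import os

    MONGO_URI = os.environ.get("MONGO_URI", "mongodb://mongodb:27017")
    MONGO_DATABASE = os.environ.get("MONGO_DATABASE", "web_crawler")
    MONGO_COLLECTION = os.environ.get("MONGO_COLLECTION", "pages")
    MONGO_USER = os.environ.get("MONGO_USER", "admin")  # not set by docker-compose yet
    MONGO_PASSWORD = os.environ.get("MONGO_PASSWORD", "Admin@123")
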
"scrapy.extensions.httpcache.FilesystemCacheStorage" + +# Set settings whose default value is deprecated to a future-proof value +FEED_EXPORT_ENCODING = "utf-8" diff --git a/crawler2/spiders/__init__.py b/crawler2/spiders/__init__.py new file mode 100644 index 0000000..d942e4c --- /dev/null +++ b/crawler2/spiders/__init__.py @@ -0,0 +1,28 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. +import scrapy +from scrapy.linkextractors import LinkExtractor + + +class CrawlerSpider(scrapy.Spider): + name = "crawler_spider" + start_urls = ["https://quotes.toscrape.com"] + link_extractor = LinkExtractor( + deny_extensions=["pdf", "jpg", "png", "gif"], # Éviter les fichiers + canonicalize=True, # Normaliser les URLs + unique=True, # Éviter les doublons + ) + + def parse(self, response): + yield { + "url": response.url, + "title": response.css("title::text").get(), + "content": " ".join(response.css("p, h1, h2, h3, h4, h5, a::text").getall()), + "links": response.css("a::attr(href)").getall() + } + + for href in response.css("a::attr(href)").getall(): + if href.startswith("/") or self.start_urls[0] in href: + yield response.follow(href, callback=self.parse) diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..a97a477 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,37 @@ +services: + mongodb: + image: mongo:8.0-noble + container_name: mongodb_crawler + restart: unless-stopped + environment: + MONGO_INITDB_ROOT_USERNAME: admin + MONGO_INITDB_ROOT_PASSWORD: Admin@123 + volumes: + - mongodb_data:/data/db + ports: + - "27017:27017" + networks: + - crawler_network + + crawler: + build: + context: . + dockerfile: Dockerfile + container_name: web_crawler + depends_on: + - mongodb + environment: + MONGO_URI: mongodb://mongodb:27017 + MONGO_DATABASE: web_pages + MONGO_COLLECTION: pages + volumes: + - .:/app + networks: + - crawler_network + +volumes: + mongodb_data: + +networks: + crawler_network: + driver: bridge diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2f43b07 Binary files /dev/null and b/requirements.txt differ diff --git a/scrapy.cfg b/scrapy.cfg new file mode 100644 index 0000000..4681c96 --- /dev/null +++ b/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = crawler2.settings + +[deploy] +#url = http://localhost:6800/ +project = crawler2