# crawler2/pipelines.py
#
# Define your item pipelines here.
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting.
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#
# Useful for handling different item types with a single interface.

import pymongo
from bs4 import BeautifulSoup


class CleanTextPipeline:
    """Strip HTML tags and collapse whitespace in the item's "content" field."""

    def process_item(self, item, spider):
        soup = BeautifulSoup(item["content"], "html.parser")
        clean_text = soup.get_text(strip=True)
        item["content"] = " ".join(clean_text.split())
        return item


class MongoDBPipeline:
    """Persist items in MongoDB, skipping URLs that are already stored."""

    def __init__(self, mongo_uri, mongo_db, mongo_collection, mongo_user, mongo_password):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.mongo_collection = mongo_collection
        self.mongo_user = mongo_user
        self.mongo_password = mongo_password
        self.client = None
        self.db = None
        self.collection = None

    @classmethod
    def from_crawler(cls, crawler):
        # Pull connection parameters from the Scrapy settings, with defaults.
        return cls(
            mongo_uri=crawler.settings.get("MONGO_URI", "mongodb://mongodb:27017"),
            mongo_db=crawler.settings.get("MONGO_DATABASE", "web_crawler"),
            mongo_collection=crawler.settings.get("MONGO_COLLECTION", "pages"),
            mongo_user=crawler.settings.get("MONGO_USER", "admin"),
            mongo_password=crawler.settings.get("MONGO_PASSWORD", "Admin@123"),
        )

    def open_spider(self, spider):
        # Connect with authentication; credentials are validated against the
        # "admin" database (authSource).
        self.client = pymongo.MongoClient(
            self.mongo_uri,
            username=self.mongo_user,
            password=self.mongo_password,
            authSource="admin",
        )
        self.db = self.client[self.mongo_db]
        self.collection = self.db[self.mongo_collection]

    def process_item(self, item, spider):
        # Only insert pages whose URL has not been stored yet (simple dedup).
        if not self.collection.find_one({"url": item["url"]}):
            self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
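

# --- Example registration (illustrative sketch) ------------------------------
# These pipelines only run if they are listed in ITEM_PIPELINES. The snippet
# below shows one way to wire them up in the project's settings.py; the module
# path "crawler2.pipelines" and the priorities 300/400 are assumptions for this
# project layout, not values taken from this file.
#
#   ITEM_PIPELINES = {
#       "crawler2.pipelines.CleanTextPipeline": 300,  # clean HTML first
#       "crawler2.pipelines.MongoDBPipeline": 400,    # then store in MongoDB
#   }
#
#   MONGO_URI = "mongodb://mongodb:27017"
#   MONGO_DATABASE = "web_crawler"
#   MONGO_COLLECTION = "pages"
#   MONGO_USER = "admin"
#   MONGO_PASSWORD = "Admin@123"  # better sourced from an environment variable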
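

# --- Assumed item shape (illustrative sketch) ---------------------------------
# Both pipelines expect items exposing "url" and "content" keys. A matching
# Item definition might look like the sketch below; the class name PageItem and
# the location crawler2/items.py are assumptions, not part of this file.
#
#   import scrapy
#
#   class PageItem(scrapy.Item):
#       url = scrapy.Field()
#       content = scrapy.Field()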