# crawler2/pipelines.py
#
# Define your item pipelines here.
#
# Don't forget to add your pipelines to the ITEM_PIPELINES setting.
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import pymongo
from bs4 import BeautifulSoup
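# A minimal sketch of how these pipelines might be registered in the
# project's settings.py. The module path matches this file; the priority
# values (300/400) are assumptions, not taken from this source:
#
# ITEM_PIPELINES = {
#     "crawler2.pipelines.CleanTextPipeline": 300,
#     "crawler2.pipelines.MongoDBPipeline": 400,
# }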


class CleanTextPipeline:
    def process_item(self, item, spider):
        # Strip HTML tags from the raw page content; separator=" " keeps
        # text from adjacent tags from being glued together, and the
        # split/join pass collapses all remaining whitespace runs.
        soup = BeautifulSoup(item["content"], "html.parser")
        clean_text = soup.get_text(separator=" ", strip=True)
        item["content"] = " ".join(clean_text.split())
        return item
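
# For example (illustrative input, not taken from the project):
#   "<p>Hello</p>\n<p>world</p>"  ->  "Hello world"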


class MongoDBPipeline:
    def __init__(self, mongo_uri, mongo_db, mongo_collection, mongo_user, mongo_password):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.mongo_collection = mongo_collection
        self.mongo_user = mongo_user
        self.mongo_password = mongo_password
        self.client = None
        self.db = None
        self.collection = None

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection settings from the crawler's settings,
        # falling back to the project's hard-coded defaults.
        return cls(
            mongo_uri=crawler.settings.get("MONGO_URI", "mongodb://mongodb:27017"),
            mongo_db=crawler.settings.get("MONGO_DATABASE", "web_crawler"),
            mongo_collection=crawler.settings.get("MONGO_COLLECTION", "pages"),
            mongo_user=crawler.settings.get("MONGO_USER", "admin"),
            mongo_password=crawler.settings.get("MONGO_PASSWORD", "Admin@123"),
        )

    def open_spider(self, spider):
        # Use the URI together with the authentication credentials.
        self.client = pymongo.MongoClient(
            self.mongo_uri,
            username=self.mongo_user,
            password=self.mongo_password,
            authSource="admin",  # authentication database
        )
        self.db = self.client[self.mongo_db]
        self.collection = self.db[self.mongo_collection]

    def process_item(self, item, spider):
        # Insert the page only if no document with the same URL exists yet.
        if not self.collection.find_one({"url": item["url"]}):
            self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
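
# Note: the find_one/insert_one pair in process_item is not atomic, so two
# concurrent writers could still insert the same URL twice. A common
# alternative (an assumption, not part of this file) is a unique index plus
# an upsert, e.g. in open_spider:
#
#     self.collection.create_index("url", unique=True)
#
# and in process_item:
#
#     self.collection.update_one(
#         {"url": item["url"]}, {"$set": dict(item)}, upsert=True
#     )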