Feat: The crawler works on Docker
crawler2/pipelines.py (new normal file, 59 lines)
@@ -0,0 +1,59 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# (a sketch of that setting is shown after this file)
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
import pymongo
from bs4 import BeautifulSoup


# crawler2/pipelines.py
class CleanTextPipeline:
    """Strip HTML tags from the scraped page and collapse whitespace."""

    def process_item(self, item, spider):
        soup = BeautifulSoup(item["content"], "html.parser")
        clean_text = soup.get_text(strip=True)
        item["content"] = " ".join(clean_text.split())
        return item


class MongoDBPipeline:
    """Store items in MongoDB, skipping URLs that were already saved."""

    def __init__(self, mongo_uri, mongo_db, mongo_collection, mongo_user, mongo_password):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.mongo_collection = mongo_collection
        self.mongo_user = mongo_user
        self.mongo_password = mongo_password
        self.client = None
        self.db = None
        self.collection = None

    @classmethod
    def from_crawler(cls, crawler):
        # Pull connection settings from the Scrapy settings; the defaults are
        # Docker-friendly ("mongodb" is the service hostname on the Docker network).
        return cls(
            mongo_uri=crawler.settings.get("MONGO_URI", "mongodb://mongodb:27017"),
            mongo_db=crawler.settings.get("MONGO_DATABASE", "web_crawler"),
            mongo_collection=crawler.settings.get("MONGO_COLLECTION", "pages"),
            mongo_user=crawler.settings.get("MONGO_USER", "admin"),
            mongo_password=crawler.settings.get("MONGO_PASSWORD", "Admin@123"),
        )

    def open_spider(self, spider):
        # Use the URI with authentication
        self.client = pymongo.MongoClient(
            self.mongo_uri,
            username=self.mongo_user,
            password=self.mongo_password,
            authSource="admin",  # authentication database
        )
        self.db = self.client[self.mongo_db]
        self.collection = self.db[self.mongo_collection]

    def process_item(self, item, spider):
        # Deduplicate by URL: insert only pages that are not stored yet.
        if not self.collection.find_one({"url": item["url"]}):
            self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
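
The header comment refers to the ITEM_PIPELINES setting, which this diff does not include. Below is a minimal sketch of the matching crawler2/settings.py entries, assuming that module path; the priority numbers are illustrative, and the MONGO_* values simply mirror the fallbacks hard-coded in from_crawler():

    # crawler2/settings.py (sketch, not part of this commit)

    # Lower number = earlier in the pipeline, so CleanTextPipeline strips
    # the HTML before MongoDBPipeline persists the item.
    ITEM_PIPELINES = {
        "crawler2.pipelines.CleanTextPipeline": 100,
        "crawler2.pipelines.MongoDBPipeline": 300,
    }

    # Values read by MongoDBPipeline.from_crawler(); shown here with the
    # same defaults the pipeline falls back to.
    MONGO_URI = "mongodb://mongodb:27017"
    MONGO_DATABASE = "web_crawler"
    MONGO_COLLECTION = "pages"
    MONGO_USER = "admin"
    MONGO_PASSWORD = "Admin@123"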
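
One design note on process_item: the find_one-then-insert_one check deduplicates by URL but is not atomic, so if more than one crawler container writes to the same collection, two items for the same URL could both pass the check. A possible alternative, sketched here as an assumption rather than as part of this commit, is a unique index plus an upsert:

    # Sketch (assumption, not in this commit): atomic dedup via unique index + upsert.
    class MongoDBPipeline:  # only the two methods that would change are shown
        def open_spider(self, spider):
            ...  # connect exactly as above, then:
            # A unique index enforces one document per URL at the database level.
            self.collection.create_index("url", unique=True)

        def process_item(self, item, spider):
            # $setOnInsert writes the fields only when the upsert actually
            # inserts, so an existing document for the URL is left untouched.
            self.collection.update_one(
                {"url": item["url"]},
                {"$setOnInsert": dict(item)},
                upsert=True,
            )
            return item

With upsert=True the round trip per item drops from two queries to one, and the unique index guarantees that a URL is never stored twice even under concurrent writers.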