From efea5e4597626e8fd1f922834b2cac8f181bc72b Mon Sep 17 00:00:00 2001 From: Furkan Melih Ercan Date: Sat, 9 Nov 2024 22:46:40 +0300 Subject: [PATCH] feat: comment task --- .../app/features/comments/db/model.py | 11 +++-- .../features/comments/services/comments.py | 29 +++++++++++- .../features/comments/services/extraction.py | 9 ++-- .../restaurants/domain/entity/yemek_sepeti.py | 44 +++++++++---------- .../restaurants/services/restaurant.py | 21 ++++++++- .../app/tasks/comments.py | 24 ++++++++-- 6 files changed, 100 insertions(+), 38 deletions(-) diff --git a/src/recommendation_engine/app/features/comments/db/model.py b/src/recommendation_engine/app/features/comments/db/model.py index 702565d..dab7f96 100644 --- a/src/recommendation_engine/app/features/comments/db/model.py +++ b/src/recommendation_engine/app/features/comments/db/model.py @@ -1,7 +1,8 @@ import uuid +import pendulum from sqlalchemy import Column from clickhouse_sqlalchemy import engines -from clickhouse_sqlalchemy.types import Int32, String, DateTime, Array +from clickhouse_sqlalchemy.types import Int32, String, DateTime, Array, Nullable from ....shared_kernel.database.clickhouse import ClickhouseBase @@ -10,15 +11,17 @@ class CommentsModel(ClickhouseBase): __tablename__ = "comments" id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4())) + provider = Column(String) rating = Column(Int32) comment = Column(String) comment_id = Column(String) replies = Column(Array(String)) like_count = Column(Int32) - created_at = Column(DateTime, nullable=True) - updated_at = Column(DateTime, nullable=True) + created_at = Column(Nullable(DateTime)) + updated_at = Column(Nullable(DateTime)) + version = Column(Int32, default=pendulum.now("Europe/Istanbul").int_timestamp) __table_args__ = ( - engines.MergeTree(order_by="id"), + engines.ReplacingMergeTree(order_by="id", version="version"), {"schema": "default"}, ) diff --git a/src/recommendation_engine/app/features/comments/services/comments.py b/src/recommendation_engine/app/features/comments/services/comments.py index 89a6c35..2fb40ef 100644 --- a/src/recommendation_engine/app/features/comments/services/comments.py +++ b/src/recommendation_engine/app/features/comments/services/comments.py @@ -1,3 +1,28 @@ +from ..db.model import CommentsModel +from ..dto.comment import CommentDto +from ....shared_kernel.generator import HashGenerator +from ....shared_kernel.database.clickhouse import get_session + + class CommentService: - def parse_all_comments(self, comments): - pass + @staticmethod + def parse_all_comments( + restaurant_id: str, provider: str, comments: list[CommentDto] + ): + with get_session() as session: + session.bulk_save_objects( + [ + CommentsModel( + provider=provider, + **comments[idx].model_dump(), + id=HashGenerator.generate_unique_hash( + [ + provider, + restaurant_id, + comments[idx].comment_id, + ] + ) + ) + for idx in range(len(comments)) + ] + ) diff --git a/src/recommendation_engine/app/features/comments/services/extraction.py b/src/recommendation_engine/app/features/comments/services/extraction.py index 3c72c5c..fdc4112 100644 --- a/src/recommendation_engine/app/features/comments/services/extraction.py +++ b/src/recommendation_engine/app/features/comments/services/extraction.py @@ -1,4 +1,3 @@ -from addict import Dict from typing import List from ..domain.entity.getir import GetirComments @@ -9,17 +8,17 @@ class CommentsExtractorService(Extractor): - def __init__(self, provider_type: Providers, **kwargs): - self.kwargs = Dict(**kwargs) + def __init__(self, provider_type: Providers, restaurant_id: str): + self.restaurant_id = restaurant_id self.provider = self.initialize_provider(provider_type) def initialize_provider( self, provider_type: Providers ) -> GetirComments | YemekSepetiComments: if provider_type == Providers.YEMEK_SEPETI: - return YemekSepetiComments(restaurant_id=self.kwargs.restaurant_id) + return YemekSepetiComments(restaurant_id=self.restaurant_id) elif provider_type == Providers.GETIR: - return GetirComments(restaurant_id=self.kwargs.restaurant_id) + return GetirComments(restaurant_id=self.restaurant_id) else: raise ValueError("Provider is not defined.") diff --git a/src/recommendation_engine/app/features/restaurants/domain/entity/yemek_sepeti.py b/src/recommendation_engine/app/features/restaurants/domain/entity/yemek_sepeti.py index e1b5876..7d6695a 100644 --- a/src/recommendation_engine/app/features/restaurants/domain/entity/yemek_sepeti.py +++ b/src/recommendation_engine/app/features/restaurants/domain/entity/yemek_sepeti.py @@ -12,28 +12,28 @@ class YemeksepetiRestaurants(BaseEntity, Processor): HEADERS = { - 'Referer': 'https://www.yemeksepeti.com/', - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15', - 'Host': 'tr.fd-api.com', - 'Origin': 'https://www.yemeksepeti.com', - 'Sec-Fetch-Dest': 'empty', - 'Sec-Fetch-Site': 'cross-site', - 'Content-Length': '12639', - 'Connection': 'keep-alive', - 'Authorization': 'Bearer Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzY29wZXMiOlsxXSwiZXhwIjozNTE2Mjg4MTA1LCJzdWIiOiJ0dG11dWRAZ21haWwuY29tIiwiY29tcGFueV9pZGVudGlmaWVyIjoidHRfbXV1ZF91c2VyIiwiaXNfc2VydmljZSI6ZmFsc2V9.Io-7tX4TOYGBrPHvt9Gu1-L5fpy_tSE_t0w9s2w36is', - 'Accept-Language': 'en-US,en;q=0.9', - 'Accept': 'application/json, text/plain, */*', - 'Content-Type': 'application/json;charset=utf-8', - 'Accept-Encoding': 'gzip, deflate, br', - 'Sec-Fetch-Mode': 'cors', - 'Request-Id': '05abe3bf-a5e1-42fa-8fe1-0c12f15b4d29', - 'X-FP-API-KEY': 'volo', - 'perseus-session-id': '1731008747698.395883673882935828.p3lf6jtoyy', - 'perseus-client-id': '1731008747697.711480228086526878.alpf6f0b2w', - 'Platform': 'web', - 'dps-session-id': 'eyJzZXNzaW9uX2lkIjoiN2Y1MTkwY2VmOWMxYjM3YjU2NjQ4ZDdkMDU5MjRiNTQiLCJwZXJzZXVzX2lkIjoiMTczMTAwODc0NzY5Ny43MTE0ODAyMjgwODY1MjY4NzguYWxwZjZmMGIydyIsInRpbWVzdGFtcCI6MTczMTAwODc1M30=', - 'App-Version': 'VENDOR-LIST-MICROFRONTEND.24.45.0049', - 'Cookie': '__cf_bm=aw2iUtJ8RDtNR8kQvZrEg6AUuQXgmY_BoRq_0ykKn5M-1731008926-1.0.1.1-63kS46LRbNymh7RoEiwBBCiSwwAvN18.pdu.5Jnh3o8Dl404cVbx0pFUnxAkQmiFG5IsUslAHWHTMbtXNHEytwNik5htDRhssfJm8xaAfu0; _pxhd=l5fq2P4NZH1lFoQbiUpz4t1P61fAQk2CDAULm8Yk-Gf3M49U8C9F5hNvT9ERPq-xuTfOinl2nlH6TvpNhtVPdg==:fPft/kKEenQomg/2ya/Uyx-Y8oD2pK8KJf/06XPMiXMjRsqn7leW2coQI5ahnU2WcsTGx9yyCkeTG2Td8hADp2mlGiYqpo/YRj9URss-6Cc=' + "Referer": "https://www.yemeksepeti.com/", + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15", + "Host": "tr.fd-api.com", + "Origin": "https://www.yemeksepeti.com", + "Sec-Fetch-Dest": "empty", + "Sec-Fetch-Site": "cross-site", + "Content-Length": "12639", + "Connection": "keep-alive", + "Authorization": "Bearer Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzY29wZXMiOlsxXSwiZXhwIjozNTE2Mjg4MTA1LCJzdWIiOiJ0dG11dWRAZ21haWwuY29tIiwiY29tcGFueV9pZGVudGlmaWVyIjoidHRfbXV1ZF91c2VyIiwiaXNfc2VydmljZSI6ZmFsc2V9.Io-7tX4TOYGBrPHvt9Gu1-L5fpy_tSE_t0w9s2w36is", + "Accept-Language": "en-US,en;q=0.9", + "Accept": "application/json, text/plain, */*", + "Content-Type": "application/json;charset=utf-8", + "Accept-Encoding": "gzip, deflate, br", + "Sec-Fetch-Mode": "cors", + "Request-Id": "05abe3bf-a5e1-42fa-8fe1-0c12f15b4d29", + "X-FP-API-KEY": "volo", + "perseus-session-id": "1731008747698.395883673882935828.p3lf6jtoyy", + "perseus-client-id": "1731008747697.711480228086526878.alpf6f0b2w", + "Platform": "web", + "dps-session-id": "eyJzZXNzaW9uX2lkIjoiN2Y1MTkwY2VmOWMxYjM3YjU2NjQ4ZDdkMDU5MjRiNTQiLCJwZXJzZXVzX2lkIjoiMTczMTAwODc0NzY5Ny43MTE0ODAyMjgwODY1MjY4NzguYWxwZjZmMGIydyIsInRpbWVzdGFtcCI6MTczMTAwODc1M30=", + "App-Version": "VENDOR-LIST-MICROFRONTEND.24.45.0049", + "Cookie": "__cf_bm=aw2iUtJ8RDtNR8kQvZrEg6AUuQXgmY_BoRq_0ykKn5M-1731008926-1.0.1.1-63kS46LRbNymh7RoEiwBBCiSwwAvN18.pdu.5Jnh3o8Dl404cVbx0pFUnxAkQmiFG5IsUslAHWHTMbtXNHEytwNik5htDRhssfJm8xaAfu0; _pxhd=l5fq2P4NZH1lFoQbiUpz4t1P61fAQk2CDAULm8Yk-Gf3M49U8C9F5hNvT9ERPq-xuTfOinl2nlH6TvpNhtVPdg==:fPft/kKEenQomg/2ya/Uyx-Y8oD2pK8KJf/06XPMiXMjRsqn7leW2coQI5ahnU2WcsTGx9yyCkeTG2Td8hADp2mlGiYqpo/YRj9URss-6Cc=", } def __init__(self, geo_value: GeoValue) -> None: diff --git a/src/recommendation_engine/app/features/restaurants/services/restaurant.py b/src/recommendation_engine/app/features/restaurants/services/restaurant.py index 00f266a..216dbfc 100644 --- a/src/recommendation_engine/app/features/restaurants/services/restaurant.py +++ b/src/recommendation_engine/app/features/restaurants/services/restaurant.py @@ -7,7 +7,11 @@ class RestaurantService: @staticmethod def parse_all_restaurants( - provider: str, restaurants: list[RestaurantDto], lat: float, lon: float, city: str + provider: str, + restaurants: list[RestaurantDto], + lat: float, + lon: float, + city: str, ): with get_session() as session: session.bulk_save_objects( @@ -29,3 +33,18 @@ def parse_all_restaurants( for idx in range(len(restaurants)) ] ) + + @staticmethod + def retrieve_restaurants_with_pagination( + provider: str, start: int = 0, page: int = 10 + ) -> list[RestaurantDto]: + with get_session() as session: + restaurants = ( + session.query(RestaurantModel) + .filter(RestaurantModel.provider == provider) + .offset(page * start) + .limit(page) + .all() + ) + + return [RestaurantDto(**restaurant.__dict__) for restaurant in restaurants] diff --git a/src/recommendation_engine/app/tasks/comments.py b/src/recommendation_engine/app/tasks/comments.py index 561546b..a0b9993 100644 --- a/src/recommendation_engine/app/tasks/comments.py +++ b/src/recommendation_engine/app/tasks/comments.py @@ -5,17 +5,33 @@ CommentService, CommentsExtractorService, ) +from ..shared_kernel.domain_providers import Providers +from ..features.restaurants.services import RestaurantService class CommentTask(Task): __name__ = "CommentTask" def run(self, *args, **kwargs): - comment_service = CommentService() - comment_extractor = CommentsExtractorService(**kwargs) + for provider in Providers: + counter = 0 + while True: + restaurants = RestaurantService.retrieve_restaurants_with_pagination( + provider=provider.value, start=counter, page=100 + ) + if len(restaurants) == 0: + break - comment_list = comment_extractor.crawl() - comment_service.parse_all_comments(comment_list) + for restaurant in restaurants: + comment_service = CommentService() + comment_extractor = CommentsExtractorService( + provider_type=provider, restaurant_id=restaurant.restaurant_id + ) + + comment_list = comment_extractor.crawl() + comment_service.parse_all_comments(comment_list) + + counter += 1 celery_application.register_task(CommentTask)