From 3db58a84a5f2e0db067dab4abdfbf1620caf63fe Mon Sep 17 00:00:00 2001
From: Yariv Menachem
Date: Mon, 6 Jan 2025 15:43:21 +0200
Subject: [PATCH] fixed goozali

---
 src/scrapers/__init__.py                      |  3 ++
 .../goozali/GoozaliScrapperComponent.py       | 27 +++++++++++-----
 src/scrapers/goozali/__init__.py              | 22 ++++++-------
 .../goozali/model/GoozaliFullRequest.py       |  8 ++---
 src/scrapers/scraper_input.py                 |  4 ++-
 .../telegram_default_handler.py               |  1 +
 6 files changed, 38 insertions(+), 27 deletions(-)

diff --git a/src/scrapers/__init__.py b/src/scrapers/__init__.py
index a28f5ae..65e4a54 100644
--- a/src/scrapers/__init__.py
+++ b/src/scrapers/__init__.py
@@ -12,6 +12,7 @@ from jobs import (
     Country,
     JobPost,
 )
+from model.User import User
 from .glassdoor import GlassdoorScraper
 from .google import GoogleJobsScraper
 from .goozali import GoozaliScraper
@@ -30,6 +31,7 @@ class SalarySource(Enum):
 
 def scrape_jobs(
     site_name: str | list[str] | Site | list[Site] | None = None,
+    user: User | None = None,
     search_term: str | None = None,
     google_search_term: str | None = None,
     location: str | None = None,
@@ -93,6 +95,7 @@ def scrape_jobs(
     country_enum = Country.from_string(country_indeed)
 
     scraper_input = ScraperInput(
+        user=user,
         site_type=get_site_type(),
         country=country_enum,
         search_term=search_term,
diff --git a/src/scrapers/goozali/GoozaliScrapperComponent.py b/src/scrapers/goozali/GoozaliScrapperComponent.py
index b1b1cbd..5025713 100644
--- a/src/scrapers/goozali/GoozaliScrapperComponent.py
+++ b/src/scrapers/goozali/GoozaliScrapperComponent.py
@@ -1,6 +1,6 @@
 from datetime import datetime, timedelta
 
-from .model import GoozaliRow, GoozaliColumn, GoozaliColumnChoice,GoozaliFieldChoice
+from .model import GoozaliRow, GoozaliColumn, GoozaliColumnChoice, GoozaliFieldChoice
 from ..utils import create_logger
 
 # Mapping function to convert parsed dictionary into GoozaliResponseData
@@ -13,12 +13,15 @@ class GoozaliScrapperComponent:
         pass
 
     # Function to filter GoozaliRows based on hours old
-    def filter_rows_by_column_choice(self, rows: list[GoozaliRow], column: GoozaliColumn, column_choice: GoozaliColumnChoice) -> list[GoozaliRow]:
+    def filter_rows_by_column_choice(self, rows: list[GoozaliRow], column: GoozaliColumn,
+                                     column_choices: list[GoozaliColumnChoice]) -> list[GoozaliRow]:
         return [
-            row for row in rows
-            if row.cellValuesByColumnId[column.id] == column_choice.id
+            row
+            for row in rows
+            if row.cellValuesByColumnId.get(column.id)
+            and any(choice.id == row.cellValuesByColumnId[column.id] for choice in column_choices)
         ]
 
     def filter_rows_by_hours(self, rows: list[GoozaliRow], hours: int) -> list[GoozaliRow]:
         # Current time
         now = datetime.now()
@@ -39,14 +42,20 @@ class GoozaliScrapperComponent:
         if (column.name == column_name):
             return column
 
-    def find_choice_from_column(self, column: GoozaliColumn, choice_name: GoozaliFieldChoice) -> GoozaliColumnChoice:
+    def find_choices_from_column(self, column: GoozaliColumn,
+                                 choices: list[GoozaliFieldChoice]) -> list[GoozaliColumnChoice]:
         if not column.typeOptions.choices:
             logger.exception(f"Choices for column {column.name} doesn't exist")
             raise Exception(f"Choices for column {column.name} doesn't exist")
+        chosen_values = [c.value for c in choices]
+        goozali_column_choices = []
         for key, choice in column.typeOptions.choices.items():
-            if choice.name == choice_name.value:
-                return choice
+            if choice.name in chosen_values:
+                goozali_column_choices.append(choice)
 
-        logger.exception(f"Can't find {choice_name} for column {column.name}")
-        raise Exception(f"Can't find {choice_name} for column {column.name}")
+        if not goozali_column_choices:
+            logger.exception(f"Can't find {choices} for column {column.name}")
+            raise Exception(f"Can't find {choices} for column {column.name}")
+
+        return goozali_column_choices
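For context on the reworked component API: find_choices_from_column now returns every GoozaliColumnChoice matching the requested field choices, and filter_rows_by_column_choice accepts that list instead of a single choice. A minimal usage sketch follows; the GoozaliFieldChoice member, the already-fetched goozali_response, and the import paths are assumptions, while the method names and signatures come from this patch:

```python
from scrapers.goozali.GoozaliScrapperComponent import GoozaliScrapperComponent
from scrapers.goozali.constants import job_post_column_to_goozali_column
from scrapers.goozali.model import GoozaliFieldChoice

component = GoozaliScrapperComponent()

# `goozali_response` is assumed to be an already-fetched and mapped
# GoozaliResponse (see GoozaliMapper elsewhere in this module).
# Resolve the "field" column, then map the desired field choices to the
# column's concrete GoozaliColumnChoice entries (raises if none match).
column = component.find_column(
    goozali_response.data.columns, job_post_column_to_goozali_column["field"])
column_choices = component.find_choices_from_column(
    column, [GoozaliFieldChoice.SOFTWARE_ENGINEERING])  # hypothetical member

# Keep rows whose cell value matches any resolved choice id, then drop
# rows older than the requested window.
rows = component.filter_rows_by_column_choice(
    goozali_response.data.rows, column, column_choices)
fresh_rows = component.filter_rows_by_hours(rows, 24)
```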
diff --git a/src/scrapers/goozali/__init__.py b/src/scrapers/goozali/__init__.py
index 2a3f1b2..e2e2f34 100644
--- a/src/scrapers/goozali/__init__.py
+++ b/src/scrapers/goozali/__init__.py
@@ -7,8 +7,10 @@ This module contains routines to scrape Goozali.
 
 from __future__ import annotations
 
-from model.User import User
-from model.user_repository import user_repository
+from jobs import (
+    JobPost,
+    JobResponse,
+)
 from .GoozaliMapper import GoozaliMapper
 from .GoozaliScrapperComponent import GoozaliScrapperComponent
 from .constants import extract_goozali_column_name, job_post_column_to_goozali_column, position_to_goozali_field_map
@@ -16,14 +18,9 @@ from .model import GoozaliColumn, GoozaliFieldChoice, GoozaliPartRequest, Goozal
 from ..scraper import Scraper
 from ..scraper_input import ScraperInput
 from ..site import Site
-
 from ..utils import create_dict_by_key_and_value, create_session, create_logger
-from jobs import (
-    JobPost,
-    JobResponse,
-)
 
-logger = create_logger("Goozali")
+logger = create_logger("GoozaliScraper")
 
 
 class GoozaliScraper(Scraper):
@@ -82,12 +79,11 @@ class GoozaliScraper(Scraper):
         # filter result by Field
         column = self.component.find_column(
             goozali_response.data.columns, job_post_column_to_goozali_column["field"])
-        user: User = user_repository.find_by_username()
-        user_goozali_field = position_to_goozali_field_map[user.position]
-        column_choice = self.component.find_choice_from_column(
-            column, user_goozali_field)
+        user_goozali_fields = position_to_goozali_field_map[scraper_input.user.position]
+        column_choices = self.component.find_choices_from_column(
+            column, user_goozali_fields)
         filtered_rows_by_column_choice = self.component.filter_rows_by_column_choice(
-            goozali_response.data.rows, column, column_choice)
+            goozali_response.data.rows, column, column_choices)
         filtered_rows_by_age_and_column_choice = self.component.filter_rows_by_hours(
             filtered_rows_by_column_choice, scraper_input.hours_old)
         dict_column_name_to_column: dict[str, GoozaliColumn] = create_dict_by_key_and_value(
diff --git a/src/scrapers/goozali/model/GoozaliFullRequest.py b/src/scrapers/goozali/model/GoozaliFullRequest.py
index 3387ed8..8c90193 100644
--- a/src/scrapers/goozali/model/GoozaliFullRequest.py
+++ b/src/scrapers/goozali/model/GoozaliFullRequest.py
@@ -6,13 +6,13 @@ class GoozaliFullRequest():
         self.view_id: str = "viwIOzPYaUGxlA0Jd"
         self.url = base_url.format(view_id=self.view_id)
         self.application_id: str = "appwewqLk7iUY4azc"
-        self.air_table_page_load_id: str = "pglqAAzFDZEWCEC7s"
+        self.air_table_page_load_id: str = "pglke45UFwdvQgBNJ"
         self.stringifiedObjectParams = {
             "shouldUseNestedResponseFormat": "true"}
         self.cookies: dict[str, str] = {}
-        self.request_id: str = "req4q4tKw3woEEWxw&"
+        self.request_id: str = "reqGjlEjOQFyRssam"
         self.share_id: str = "shrQBuWjXd0YgPqV6"
-        self.signature: str = "be8bd40c133f051f929ebab311c416013f5af0d5acae4264575b88ccf051ee59"
+        self.signature: str = "7a1402a3f7f6f9a23c8db3849878812f2d3141da60f3b3d6e14dd4a910b91b74"
         self.headers = self._generate_headers()
         self.params = self._generate_params()
         self.cookies = {}
@@ -66,7 +66,7 @@ class GoozaliFullRequest():
             "shareId": self.share_id,
             "applicationId": self.application_id,
             "generationNumber": 0,
-            "expires": "2025-01-02T00:00:00.000Z",
+            "expires": "2025-01-30T00:00:00.000Z",
             "signature": self.signature
         }
         # Convert to a JSON string
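The rotated pageLoadId/requestId/signature values above come from a fresh Airtable share link, and the bumped expires keeps the signed parameters valid longer. For reference, a sketch of the JSON blob those values feed into; only the keys visible in the final hunk are real, and the helper name is made up:

```python
import json


def build_signed_params(share_id: str, application_id: str, signature: str) -> str:
    """Hypothetical helper mirroring the params dict built in GoozaliFullRequest."""
    params = {
        "shareId": share_id,
        "applicationId": application_id,
        "generationNumber": 0,
        "expires": "2025-01-30T00:00:00.000Z",  # must match the signed share link
        "signature": signature,
    }
    # Convert to a JSON string, as the request parameters expect
    return json.dumps(params)
```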
"applicationId": self.application_id, "generationNumber": 0, - "expires": "2025-01-02T00:00:00.000Z", + "expires": "2025-01-30T00:00:00.000Z", "signature": self.signature } # Convert to a JSON string diff --git a/src/scrapers/scraper_input.py b/src/scrapers/scraper_input.py index 9b3a183..381eec2 100644 --- a/src/scrapers/scraper_input.py +++ b/src/scrapers/scraper_input.py @@ -1,11 +1,13 @@ from pydantic import BaseModel from jobs import Country, JobType, DescriptionFormat +from model.User import User from scrapers.site import Site class ScraperInput(BaseModel): site_type: list[Site] + user: User search_term: str | None = None google_search_term: str | None = None @@ -22,4 +24,4 @@ class ScraperInput(BaseModel): description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN results_wanted: int = 15 - hours_old: int | None = None \ No newline at end of file + hours_old: int | None = None diff --git a/src/telegram_handler/telegram_default_handler.py b/src/telegram_handler/telegram_default_handler.py index 0266d4f..02bcf35 100644 --- a/src/telegram_handler/telegram_default_handler.py +++ b/src/telegram_handler/telegram_default_handler.py @@ -58,6 +58,7 @@ class TelegramDefaultHandler(TelegramHandler): f"Start scarping: {site_names_print}") filtered_out_jobs, jobs = scrape_jobs( site_name=self.sites_to_scrap, + user=user, search_term=user.position.value, locations=locations, results_wanted=200,