fixed goozali

pull/231/head
Yariv Menachem 2025-01-06 15:43:21 +02:00
parent 0d01789313
commit 3db58a84a5
6 changed files with 43 additions and 27 deletions

View File

@@ -12,6 +12,7 @@ from jobs import (
     Country,
     JobPost,
 )
+from model.User import User
 from .glassdoor import GlassdoorScraper
 from .google import GoogleJobsScraper
 from .goozali import GoozaliScraper
@@ -30,6 +31,7 @@ class SalarySource(Enum):
 
 def scrape_jobs(
     site_name: str | list[str] | Site | list[Site] | None = None,
+    user: User = None,
    search_term: str | None = None,
     google_search_term: str | None = None,
     location: str | None = None,
@@ -93,6 +95,7 @@ def scrape_jobs(
 
     country_enum = Country.from_string(country_indeed)
     scraper_input = ScraperInput(
+        user=user,
         site_type=get_site_type(),
         country=country_enum,
         search_term=search_term,

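The new `user` keyword is threaded from the public API down to `ScraperInput`. A minimal usage sketch follows; the import path and argument values are assumptions for illustration, not taken from the diff:

```python
from model.User import User
from scrapers import scrape_jobs  # assumed package root, matching "from scrapers.site import Site"


def run_goozali_for(user: User):
    # `user` now rides along in ScraperInput so the Goozali scraper can read user.position.
    filtered_out_jobs, jobs = scrape_jobs(
        site_name="goozali",              # illustrative; any supported Site works
        user=user,                        # new keyword added by this commit
        search_term=user.position.value,  # same convention the telegram handler below uses
        results_wanted=50,
        hours_old=72,
    )
    return jobs
```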
View File

@@ -1,6 +1,6 @@
 from datetime import datetime, timedelta
 
-from .model import GoozaliRow, GoozaliColumn, GoozaliColumnChoice,GoozaliFieldChoice
+from .model import GoozaliRow, GoozaliColumn, GoozaliColumnChoice, GoozaliFieldChoice
 from ..utils import create_logger
 
 # Mapping function to convert parsed dictionary into GoozaliResponseData
@@ -13,12 +13,20 @@ class GoozaliScrapperComponent:
         pass
 
     # Function to filter GoozaliRows based on hours old
-    def filter_rows_by_column_choice(self, rows: list[GoozaliRow], column: GoozaliColumn, column_choice: GoozaliColumnChoice) -> list[GoozaliRow]:
+    def filter_rows_by_column_choice(self, rows: list[GoozaliRow], column: GoozaliColumn,
+                                     column_choices: list[GoozaliColumnChoice]) -> list[GoozaliRow]:
         return [
-            row for row in rows
-            if row.cellValuesByColumnId[column.id] == column_choice.id
+            row
+            for row in rows
+            if row.cellValuesByColumnId.get(column.id)
+            and any(choice.id == row.cellValuesByColumnId[column.id] for choice in column_choices)
         ]
+        # return [
+        #     row for row in rows
+        #     if row.cellValuesByColumnId[column.id] == column_choice.id
+        # ]
 
     def filter_rows_by_hours(self, rows: list[GoozaliRow], hours: int) -> list[GoozaliRow]:
         # Current time
         now = datetime.now()
@@ -39,14 +47,20 @@ class GoozaliScrapperComponent:
         if (column.name == column_name):
             return column
 
-    def find_choice_from_column(self, column: GoozaliColumn, choice_name: GoozaliFieldChoice) -> GoozaliColumnChoice:
+    def find_choices_from_column(self, column: GoozaliColumn, choices: list[GoozaliFieldChoice]) -> list[
+            GoozaliColumnChoice]:
         if not column.typeOptions.choices:
             logger.exception(f"Choices for column {column.name} doesn't exist")
             raise Exception(f"Choices for column {column.name} doesn't exist")
 
+        chosen_values = [c.value for c in choices]
+        goozali_column_choices = []
         for key, choice in column.typeOptions.choices.items():
-            if choice.name == choice_name.value:
-                return choice
-
-        logger.exception(f"Can't find {choice_name} for column {column.name}")
-        raise Exception(f"Can't find {choice_name} for column {column.name}")
+            if choice.name in chosen_values:
+                goozali_column_choices.append(choice)
+
+        if len(goozali_column_choices) == 0:
+            logger.exception(f"Can't find {choices} for column {column.name}")
+            raise Exception(f"Can't find {choices} for column {column.name}")
+
+        return goozali_column_choices

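The rewritten `filter_rows_by_column_choice` now keeps a row when its cell value matches any of several column choices, and it tolerates rows that lack the column entirely (via `.get()`). A self-contained sketch of the same predicate, using toy stand-ins for the Goozali models (field names mirror the diff above):

```python
from dataclasses import dataclass, field


@dataclass
class Choice:  # stand-in for GoozaliColumnChoice
    id: str
    name: str


@dataclass
class Row:  # stand-in for GoozaliRow
    cellValuesByColumnId: dict = field(default_factory=dict)


column_id = "fldXYZ"  # hypothetical column id
choices = [Choice("sel1", "Backend"), Choice("sel2", "Full-Stack")]
rows = [
    Row({column_id: "sel1"}),  # kept: matches the first choice
    Row({column_id: "sel9"}),  # dropped: value not among the chosen ids
    Row({}),                   # dropped: column missing, .get() returns None
]

kept = [
    row
    for row in rows
    if row.cellValuesByColumnId.get(column_id)
    and any(choice.id == row.cellValuesByColumnId[column_id] for choice in choices)
]
assert len(kept) == 1
```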
View File

@@ -7,8 +7,10 @@ This module contains routines to scrape Goozali.
 
 from __future__ import annotations
 
-from model.User import User
-from model.user_repository import user_repository
+from jobs import (
+    JobPost,
+    JobResponse,
+)
 from .GoozaliMapper import GoozaliMapper
 from .GoozaliScrapperComponent import GoozaliScrapperComponent
 from .constants import extract_goozali_column_name, job_post_column_to_goozali_column, position_to_goozali_field_map
@@ -16,14 +18,9 @@ from .model import GoozaliColumn, GoozaliFieldChoice, GoozaliPartRequest, Goozal
 from ..scraper import Scraper
 from ..scraper_input import ScraperInput
 from ..site import Site
 from ..utils import create_dict_by_key_and_value, create_session, create_logger
 
-from jobs import (
-    JobPost,
-    JobResponse,
-)
-
-logger = create_logger("Goozali")
+logger = create_logger("GoozaliScraper")
 
 
 class GoozaliScraper(Scraper):
@@ -82,12 +79,11 @@ class GoozaliScraper(Scraper):
         # filter result by Field
         column = self.component.find_column(
             goozali_response.data.columns, job_post_column_to_goozali_column["field"])
-        user: User = user_repository.find_by_username()
-        user_goozali_field = position_to_goozali_field_map[user.position]
-        column_choice = self.component.find_choice_from_column(
-            column, user_goozali_field)
+        user_goozali_fields = position_to_goozali_field_map[scraper_input.user.position]
+        column_choices = self.component.find_choices_from_column(
+            column, user_goozali_fields)
         filtered_rows_by_column_choice = self.component.filter_rows_by_column_choice(
-            goozali_response.data.rows, column, column_choice)
+            goozali_response.data.rows, column, column_choices)
         filtered_rows_by_age_and_column_choice = self.component.filter_rows_by_hours(
             filtered_rows_by_column_choice, scraper_input.hours_old)
         dict_column_name_to_column: dict[str, GoozaliColumn] = create_dict_by_key_and_value(

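Taken together, the scraper no longer looks the user up via `user_repository`; it reads the position straight off `scraper_input.user` and filters on a list of field choices. A condensed, paraphrased view of the new flow (the method name here is hypothetical and it is not a drop-in snippet; it assumes the names imported in the diff above):

```python
def _filter_rows_for_user(self, goozali_response, scraper_input: ScraperInput) -> list:
    # column that holds the job's field/discipline
    column = self.component.find_column(
        goozali_response.data.columns, job_post_column_to_goozali_column["field"])
    # position -> list of GoozaliFieldChoice values (mapping lives in .constants)
    user_goozali_fields = position_to_goozali_field_map[scraper_input.user.position]
    column_choices = self.component.find_choices_from_column(column, user_goozali_fields)
    # keep rows matching any chosen field, then drop rows older than hours_old
    rows = self.component.filter_rows_by_column_choice(
        goozali_response.data.rows, column, column_choices)
    return self.component.filter_rows_by_hours(rows, scraper_input.hours_old)
```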
View File

@@ -6,13 +6,13 @@ class GoozaliFullRequest():
         self.view_id: str = "viwIOzPYaUGxlA0Jd"
         self.url = base_url.format(view_id=self.view_id)
         self.application_id: str = "appwewqLk7iUY4azc"
-        self.air_table_page_load_id: str = "pglqAAzFDZEWCEC7s"
+        self.air_table_page_load_id: str = "pglke45UFwdvQgBNJ"
         self.stringifiedObjectParams = {
             "shouldUseNestedResponseFormat": "true"}
         self.cookies: dict[str, str] = {}
-        self.request_id: str = "req4q4tKw3woEEWxw&"
+        self.request_id: str = "reqGjlEjOQFyRssam"
         self.share_id: str = "shrQBuWjXd0YgPqV6"
-        self.signature: str = "be8bd40c133f051f929ebab311c416013f5af0d5acae4264575b88ccf051ee59"
+        self.signature: str = "7a1402a3f7f6f9a23c8db3849878812f2d3141da60f3b3d6e14dd4a910b91b74"
         self.headers = self._generate_headers()
         self.params = self._generate_params()
         self.cookies = {}
@@ -66,7 +66,7 @@ class GoozaliFullRequest():
             "shareId": self.share_id,
             "applicationId": self.application_id,
             "generationNumber": 0,
-            "expires": "2025-01-02T00:00:00.000Z",
+            "expires": "2025-01-30T00:00:00.000Z",
             "signature": self.signature
         }
         # Convert to a JSON string

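The new `air_table_page_load_id`, `request_id`, `signature`, and `expires` values replace a share policy that expired on 2025-01-02, which is why the old hard-coded request stopped working. A small hypothetical helper, not part of the codebase, that would flag when the hard-coded policy needs refreshing again:

```python
from datetime import datetime, timezone


def share_policy_expired(expires: str = "2025-01-30T00:00:00.000Z") -> bool:
    # Parse the Airtable-style ISO timestamp used in the request's "expires" field.
    expiry = datetime.strptime(expires, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc)
    return datetime.now(timezone.utc) >= expiry


if share_policy_expired():
    print("Goozali share policy expired - refresh air_table_page_load_id, request_id and signature")
```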
View File

@@ -1,11 +1,13 @@
 from pydantic import BaseModel
 
 from jobs import Country, JobType, DescriptionFormat
+from model.User import User
 from scrapers.site import Site
 
 
 class ScraperInput(BaseModel):
     site_type: list[Site]
+    user: User
 
     search_term: str | None = None
     google_search_term: str | None = None
@@ -22,4 +24,4 @@ class ScraperInput(BaseModel):
     description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN
     results_wanted: int = 15
     hours_old: int | None = None

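Because `user` is declared on the pydantic model without a default, it is now a required field: forgetting to pass it fails validation at construction time rather than deep inside a scraper. A stand-alone sketch with toy stand-in models (names are illustrative, not from the repo):

```python
from pydantic import BaseModel, ValidationError


class UserStub(BaseModel):
    username: str
    position: str


class ScraperInputStub(BaseModel):
    site_type: list[str]
    user: UserStub                  # required, like the new field in the diff
    search_term: str | None = None  # optional, unchanged behaviour


try:
    ScraperInputStub(site_type=["goozali"])
except ValidationError as e:
    print("user is now mandatory:", e.errors()[0]["loc"])
```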
View File

@@ -58,6 +58,7 @@ class TelegramDefaultHandler(TelegramHandler):
             f"Start scarping: {site_names_print}")
         filtered_out_jobs, jobs = scrape_jobs(
             site_name=self.sites_to_scrap,
+            user=user,
             search_term=user.position.value,
             locations=locations,
             results_wanted=200,