fixed goozali

pull/231/head
Yariv Menachem 2025-01-06 15:43:21 +02:00
parent 0d01789313
commit 3db58a84a5
6 changed files with 43 additions and 27 deletions

View File

@@ -12,6 +12,7 @@ from jobs import (
     Country,
     JobPost,
 )
+from model.User import User
 from .glassdoor import GlassdoorScraper
 from .google import GoogleJobsScraper
 from .goozali import GoozaliScraper
@@ -30,6 +31,7 @@ class SalarySource(Enum):
 def scrape_jobs(
     site_name: str | list[str] | Site | list[Site] | None = None,
+    user: User = None,
     search_term: str | None = None,
     google_search_term: str | None = None,
     location: str | None = None,
@@ -93,6 +95,7 @@ def scrape_jobs(
     country_enum = Country.from_string(country_indeed)
     scraper_input = ScraperInput(
+        user=user,
         site_type=get_site_type(),
         country=country_enum,
         search_term=search_term,
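With this change the caller's User rides along from scrape_jobs into ScraperInput instead of being looked up inside the Goozali scraper. A minimal calling sketch, assuming the scrapers import path and the position attribute used elsewhere in this diff:

    from model.User import User
    from scrapers import scrape_jobs  # import path assumed

    def run_for(user: User):
        # user is forwarded into ScraperInput; no repository lookup needed
        filtered_out_jobs, jobs = scrape_jobs(
            site_name="goozali",
            user=user,
            search_term=user.position.value,
            results_wanted=50,
        )
        return jobs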

View File

@@ -1,6 +1,6 @@
 from datetime import datetime, timedelta
-from .model import GoozaliRow, GoozaliColumn, GoozaliColumnChoice,GoozaliFieldChoice
+from .model import GoozaliRow, GoozaliColumn, GoozaliColumnChoice, GoozaliFieldChoice
 from ..utils import create_logger

 # Mapping function to convert parsed dictionary into GoozaliResponseData
@@ -13,12 +13,20 @@ class GoozaliScrapperComponent:
         pass

     # Function to filter GoozaliRows based on hours old
-    def filter_rows_by_column_choice(self, rows: list[GoozaliRow], column: GoozaliColumn, column_choice: GoozaliColumnChoice) -> list[GoozaliRow]:
+    def filter_rows_by_column_choice(self, rows: list[GoozaliRow], column: GoozaliColumn,
+                                     column_choices: list[GoozaliColumnChoice]) -> list[GoozaliRow]:
         return [
-            row for row in rows
-            if row.cellValuesByColumnId[column.id] == column_choice.id
+            row
+            for row in rows
+            if row.cellValuesByColumnId.get(column.id)
+            and any(choice.id == row.cellValuesByColumnId[column.id] for choice in column_choices)
         ]
+        # return [
+        #     row for row in rows
+        #     if row.cellValuesByColumnId[column.id] == column_choice.id
+        # ]

     def filter_rows_by_hours(self, rows: list[GoozaliRow], hours: int) -> list[GoozaliRow]:
         # Current time
         now = datetime.now()
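The rewritten comprehension changes two behaviors: rows with no cell for the column are now skipped (the old version raised KeyError), and a row matches if its cell equals any of several choices. The same logic, self-contained with stand-in types:

    from dataclasses import dataclass, field

    @dataclass
    class Choice:                      # stands in for GoozaliColumnChoice
        id: str

    @dataclass
    class Row:                         # stands in for GoozaliRow
        cellValuesByColumnId: dict[str, str] = field(default_factory=dict)

    def filter_rows(rows: list[Row], column_id: str, choices: list[Choice]) -> list[Row]:
        # .get() skips rows missing the cell instead of raising KeyError;
        # any() accepts a cell that matches one of several choice ids
        return [
            row
            for row in rows
            if row.cellValuesByColumnId.get(column_id)
            and any(c.id == row.cellValuesByColumnId[column_id] for c in choices)
        ]

    rows = [Row({"fld1": "selBackend"}),   # kept: matches a wanted choice
            Row({"fld1": "selQA"}),        # dropped: no matching choice
            Row({})]                       # dropped (old code crashed here)
    print(len(filter_rows(rows, "fld1", [Choice("selBackend"), Choice("selData")])))  # 1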
@@ -39,14 +47,20 @@ class GoozaliScrapperComponent:
         if (column.name == column_name):
             return column

-    def find_choice_from_column(self, column: GoozaliColumn, choice_name: GoozaliFieldChoice) -> GoozaliColumnChoice:
+    def find_choices_from_column(self, column: GoozaliColumn, choices: list[GoozaliFieldChoice]) -> list[
+            GoozaliColumnChoice]:
         if not column.typeOptions.choices:
             logger.exception(f"Choices for column {column.name} doesn't exist")
             raise Exception(f"Choices for column {column.name} doesn't exist")
+        chosen_values = [c.value for c in choices]
+        goozali_column_choices = []
         for key, choice in column.typeOptions.choices.items():
-            if choice.name == choice_name.value:
-                return choice
+            if choice.name in chosen_values:
+                goozali_column_choices.append(choice)
-        logger.exception(f"Can't find {choice_name} for column {column.name}")
-        raise Exception(f"Can't find {choice_name} for column {column.name}")
+        if len(goozali_column_choices) == 0:
+            logger.exception(f"Can't find {choices} for column {column.name}")
+            raise Exception(f"Can't find {choices} for column {column.name}")
+        return goozali_column_choices
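find_choices_from_column now collects every matching choice rather than returning on the first hit, and raises only when nothing matched. The core lookup reduced to plain dicts (stand-in names, not the real Goozali models):

    from enum import Enum

    class FieldChoice(Enum):               # stands in for GoozaliFieldChoice
        BACKEND = "Backend Developer"
        DATA = "Data Engineer"

    def find_choices(available: dict[str, str], wanted: list[FieldChoice]) -> list[str]:
        # available maps choice id -> display name, like column.typeOptions.choices
        chosen_values = [w.value for w in wanted]
        found = [cid for cid, name in available.items() if name in chosen_values]
        if not found:                      # fail only when no choice matched at all
            raise Exception(f"Can't find {wanted}")
        return found

    print(find_choices({"sel1": "Backend Developer", "sel2": "QA"},
                       [FieldChoice.BACKEND, FieldChoice.DATA]))  # ['sel1']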

View File

@@ -7,8 +7,10 @@ This module contains routines to scrape Goozali.
 from __future__ import annotations

-from model.User import User
-from model.user_repository import user_repository
+from jobs import (
+    JobPost,
+    JobResponse,
+)
 from .GoozaliMapper import GoozaliMapper
 from .GoozaliScrapperComponent import GoozaliScrapperComponent
 from .constants import extract_goozali_column_name, job_post_column_to_goozali_column, position_to_goozali_field_map
@@ -16,14 +18,9 @@ from .model import GoozaliColumn, GoozaliFieldChoice, GoozaliPartRequest, Goozal
 from ..scraper import Scraper
 from ..scraper_input import ScraperInput
 from ..site import Site
 from ..utils import create_dict_by_key_and_value, create_session, create_logger
-from jobs import (
-    JobPost,
-    JobResponse,
-)

-logger = create_logger("Goozali")
+logger = create_logger("GoozaliScraper")

 class GoozaliScraper(Scraper):
@@ -82,12 +79,11 @@ class GoozaliScraper(Scraper):
         # filter result by Field
         column = self.component.find_column(
             goozali_response.data.columns, job_post_column_to_goozali_column["field"])
-        user: User = user_repository.find_by_username()
-        user_goozali_field = position_to_goozali_field_map[user.position]
-        column_choice = self.component.find_choice_from_column(
-            column, user_goozali_field)
+        user_goozali_fields = position_to_goozali_field_map[scraper_input.user.position]
+        column_choices = self.component.find_choices_from_column(
+            column, user_goozali_fields)
         filtered_rows_by_column_choice = self.component.filter_rows_by_column_choice(
-            goozali_response.data.rows, column, column_choice)
+            goozali_response.data.rows, column, column_choices)
         filtered_rows_by_age_and_column_choice = self.component.filter_rows_by_hours(
             filtered_rows_by_column_choice, scraper_input.hours_old)
         dict_column_name_to_column: dict[str, GoozaliColumn] = create_dict_by_key_and_value(
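Because find_choices_from_column now takes a list, position_to_goozali_field_map must map each position to a list of field choices rather than a single one. A sketch with stand-in enums (the real definitions live in .constants and .model):

    from enum import Enum

    class Position(Enum):                  # hypothetical user-position enum
        BACKEND = "backend"

    class GoozaliFieldChoice(Enum):        # values mirror Goozali display names
        SOFTWARE_ENGINEERING = "Software Engineering"
        DEVOPS = "DevOps"

    # each position now fans out to several acceptable field choices
    position_to_goozali_field_map: dict[Position, list[GoozaliFieldChoice]] = {
        Position.BACKEND: [GoozaliFieldChoice.SOFTWARE_ENGINEERING,
                           GoozaliFieldChoice.DEVOPS],
    }

    wanted = position_to_goozali_field_map[Position.BACKEND]
    print([w.value for w in wanted])       # ['Software Engineering', 'DevOps']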

View File

@@ -6,13 +6,13 @@ class GoozaliFullRequest():
         self.view_id: str = "viwIOzPYaUGxlA0Jd"
         self.url = base_url.format(view_id=self.view_id)
         self.application_id: str = "appwewqLk7iUY4azc"
-        self.air_table_page_load_id: str = "pglqAAzFDZEWCEC7s"
+        self.air_table_page_load_id: str = "pglke45UFwdvQgBNJ"
         self.stringifiedObjectParams = {
             "shouldUseNestedResponseFormat": "true"}
         self.cookies: dict[str, str] = {}
-        self.request_id: str = "req4q4tKw3woEEWxw&"
+        self.request_id: str = "reqGjlEjOQFyRssam"
         self.share_id: str = "shrQBuWjXd0YgPqV6"
-        self.signature: str = "be8bd40c133f051f929ebab311c416013f5af0d5acae4264575b88ccf051ee59"
+        self.signature: str = "7a1402a3f7f6f9a23c8db3849878812f2d3141da60f3b3d6e14dd4a910b91b74"
         self.headers = self._generate_headers()
         self.params = self._generate_params()
         self.cookies = {}
@@ -66,7 +66,7 @@ class GoozaliFullRequest():
             "shareId": self.share_id,
             "applicationId": self.application_id,
             "generationNumber": 0,
-            "expires": "2025-01-02T00:00:00.000Z",
+            "expires": "2025-01-30T00:00:00.000Z",
             "signature": self.signature
         }
         # Convert to a JSON string
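These hard-coded Airtable share-view credentials (page-load id, request id, signature, expiry) appear to rotate roughly monthly. A hedged sketch of a startup staleness check, so a dead signature fails fast instead of producing empty scrapes (helper name hypothetical):

    from datetime import datetime, timezone

    def share_link_is_stale(expires: str) -> bool:
        # expires is the ISO timestamp baked into the request params,
        # e.g. "2025-01-30T00:00:00.000Z"
        expiry = datetime.fromisoformat(expires.replace("Z", "+00:00"))
        return datetime.now(timezone.utc) >= expiry

    print(share_link_is_stale("2025-01-30T00:00:00.000Z"))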

View File

@@ -1,11 +1,13 @@
 from pydantic import BaseModel

 from jobs import Country, JobType, DescriptionFormat
+from model.User import User
 from scrapers.site import Site


 class ScraperInput(BaseModel):
     site_type: list[Site]
+    user: User
     search_term: str | None = None
     google_search_term: str | None = None
@@ -22,4 +24,4 @@ class ScraperInput(BaseModel):
     description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN
     results_wanted: int = 15
-    hours_old: int | None = None
\ No newline at end of file
+    hours_old: int | None = None
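One consequence worth noting: user is declared without a default, so Pydantic treats it as required, while scrape_jobs still defaults user to None; calling scrape_jobs without a user would therefore fail validation. A minimal sketch with a stand-in User model:

    from pydantic import BaseModel, ValidationError

    class User(BaseModel):                 # stand-in for model.User
        username: str

    class ScraperInput(BaseModel):         # trimmed to the relevant fields
        user: User                         # no default, so it is required
        search_term: str | None = None

    try:
        ScraperInput(user=None, search_term="python")
    except ValidationError:
        print("user=None is rejected; a real User instance is required")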

View File

@@ -58,6 +58,7 @@ class TelegramDefaultHandler(TelegramHandler):
             f"Start scarping: {site_names_print}")
         filtered_out_jobs, jobs = scrape_jobs(
             site_name=self.sites_to_scrap,
+            user=user,
             search_term=user.position.value,
             locations=locations,
             results_wanted=200,