mirror of https://github.com/Bunsly/JobSpy
fixed goozali
parent
0d01789313
commit
3db58a84a5
|
@ -12,6 +12,7 @@ from jobs import (
|
|||
Country,
|
||||
JobPost,
|
||||
)
|
||||
from model.User import User
|
||||
from .glassdoor import GlassdoorScraper
|
||||
from .google import GoogleJobsScraper
|
||||
from .goozali import GoozaliScraper
|
||||
|
@ -30,6 +31,7 @@ class SalarySource(Enum):
|
|||
|
||||
def scrape_jobs(
|
||||
site_name: str | list[str] | Site | list[Site] | None = None,
|
||||
user: User = None,
|
||||
search_term: str | None = None,
|
||||
google_search_term: str | None = None,
|
||||
location: str | None = None,
|
||||
|
@ -93,6 +95,7 @@ def scrape_jobs(
|
|||
|
||||
country_enum = Country.from_string(country_indeed)
|
||||
scraper_input = ScraperInput(
|
||||
user=user,
|
||||
site_type=get_site_type(),
|
||||
country=country_enum,
|
||||
search_term=search_term,
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from datetime import datetime, timedelta
|
||||
|
||||
from .model import GoozaliRow, GoozaliColumn, GoozaliColumnChoice,GoozaliFieldChoice
|
||||
from .model import GoozaliRow, GoozaliColumn, GoozaliColumnChoice, GoozaliFieldChoice
|
||||
from ..utils import create_logger
|
||||
|
||||
# Mapping function to convert parsed dictionary into GoozaliResponseData
|
||||
|
@ -13,12 +13,20 @@ class GoozaliScrapperComponent:
|
|||
pass
|
||||
|
||||
# Function to filter GoozaliRows based on hours old
|
||||
def filter_rows_by_column_choice(self, rows: list[GoozaliRow], column: GoozaliColumn, column_choice: GoozaliColumnChoice) -> list[GoozaliRow]:
|
||||
def filter_rows_by_column_choice(self, rows: list[GoozaliRow], column: GoozaliColumn,
|
||||
column_choices: list[GoozaliColumnChoice]) -> list[GoozaliRow]:
|
||||
return [
|
||||
row for row in rows
|
||||
if row.cellValuesByColumnId[column.id] == column_choice.id
|
||||
row
|
||||
for row in rows
|
||||
if row.cellValuesByColumnId.get(column.id)
|
||||
and any(choice.id == row.cellValuesByColumnId[column.id] for choice in column_choices)
|
||||
]
|
||||
|
||||
# return [
|
||||
# row for row in rows
|
||||
# if row.cellValuesByColumnId[column.id] == column_choice.id
|
||||
# ]
|
||||
|
||||
def filter_rows_by_hours(self, rows: list[GoozaliRow], hours: int) -> list[GoozaliRow]:
|
||||
# Current time
|
||||
now = datetime.now()
|
||||
|
@ -39,14 +47,20 @@ class GoozaliScrapperComponent:
|
|||
if (column.name == column_name):
|
||||
return column
|
||||
|
||||
def find_choice_from_column(self, column: GoozaliColumn, choice_name: GoozaliFieldChoice) -> GoozaliColumnChoice:
|
||||
def find_choices_from_column(self, column: GoozaliColumn, choices: list[GoozaliFieldChoice]) -> list[
|
||||
GoozaliColumnChoice]:
|
||||
if not column.typeOptions.choices:
|
||||
logger.exception(f"Choices for column {column.name} doesn't exist")
|
||||
raise Exception(f"Choices for column {column.name} doesn't exist")
|
||||
chosen_values = [c.value for c in choices]
|
||||
goozali_column_choices = []
|
||||
|
||||
for key, choice in column.typeOptions.choices.items():
|
||||
if choice.name == choice_name.value:
|
||||
return choice
|
||||
if choice.name in chosen_values:
|
||||
goozali_column_choices.append(choice)
|
||||
|
||||
logger.exception(f"Can't find {choice_name} for column {column.name}")
|
||||
raise Exception(f"Can't find {choice_name} for column {column.name}")
|
||||
if len(goozali_column_choices) == 0:
|
||||
logger.exception(f"Can't find {choices} for column {column.name}")
|
||||
raise Exception(f"Can't find {choices} for column {column.name}")
|
||||
|
||||
return goozali_column_choices
|
||||
|
|
|
@ -7,8 +7,10 @@ This module contains routines to scrape Goozali.
|
|||
|
||||
from __future__ import annotations
|
||||
|
||||
from model.User import User
|
||||
from model.user_repository import user_repository
|
||||
from jobs import (
|
||||
JobPost,
|
||||
JobResponse,
|
||||
)
|
||||
from .GoozaliMapper import GoozaliMapper
|
||||
from .GoozaliScrapperComponent import GoozaliScrapperComponent
|
||||
from .constants import extract_goozali_column_name, job_post_column_to_goozali_column, position_to_goozali_field_map
|
||||
|
@ -16,14 +18,9 @@ from .model import GoozaliColumn, GoozaliFieldChoice, GoozaliPartRequest, Goozal
|
|||
from ..scraper import Scraper
|
||||
from ..scraper_input import ScraperInput
|
||||
from ..site import Site
|
||||
|
||||
from ..utils import create_dict_by_key_and_value, create_session, create_logger
|
||||
from jobs import (
|
||||
JobPost,
|
||||
JobResponse,
|
||||
)
|
||||
|
||||
logger = create_logger("Goozali")
|
||||
logger = create_logger("GoozaliScraper")
|
||||
|
||||
|
||||
class GoozaliScraper(Scraper):
|
||||
|
@ -82,12 +79,11 @@ class GoozaliScraper(Scraper):
|
|||
# filter result by Field
|
||||
column = self.component.find_column(
|
||||
goozali_response.data.columns, job_post_column_to_goozali_column["field"])
|
||||
user: User = user_repository.find_by_username()
|
||||
user_goozali_field = position_to_goozali_field_map[user.position]
|
||||
column_choice = self.component.find_choice_from_column(
|
||||
column, user_goozali_field)
|
||||
user_goozali_fields = position_to_goozali_field_map[scraper_input.user.position]
|
||||
column_choices = self.component.find_choices_from_column(
|
||||
column, user_goozali_fields)
|
||||
filtered_rows_by_column_choice = self.component.filter_rows_by_column_choice(
|
||||
goozali_response.data.rows, column, column_choice)
|
||||
goozali_response.data.rows, column, column_choices)
|
||||
filtered_rows_by_age_and_column_choice = self.component.filter_rows_by_hours(
|
||||
filtered_rows_by_column_choice, scraper_input.hours_old)
|
||||
dict_column_name_to_column: dict[str, GoozaliColumn] = create_dict_by_key_and_value(
|
||||
|
|
|
@ -6,13 +6,13 @@ class GoozaliFullRequest():
|
|||
self.view_id: str = "viwIOzPYaUGxlA0Jd"
|
||||
self.url = base_url.format(view_id=self.view_id)
|
||||
self.application_id: str = "appwewqLk7iUY4azc"
|
||||
self.air_table_page_load_id: str = "pglqAAzFDZEWCEC7s"
|
||||
self.air_table_page_load_id: str = "pglke45UFwdvQgBNJ"
|
||||
self.stringifiedObjectParams = {
|
||||
"shouldUseNestedResponseFormat": "true"}
|
||||
self.cookies: dict[str, str] = {}
|
||||
self.request_id: str = "req4q4tKw3woEEWxw&"
|
||||
self.request_id: str = "reqGjlEjOQFyRssam"
|
||||
self.share_id: str = "shrQBuWjXd0YgPqV6"
|
||||
self.signature: str = "be8bd40c133f051f929ebab311c416013f5af0d5acae4264575b88ccf051ee59"
|
||||
self.signature: str = "7a1402a3f7f6f9a23c8db3849878812f2d3141da60f3b3d6e14dd4a910b91b74"
|
||||
self.headers = self._generate_headers()
|
||||
self.params = self._generate_params()
|
||||
self.cookies = {}
|
||||
|
@ -66,7 +66,7 @@ class GoozaliFullRequest():
|
|||
"shareId": self.share_id,
|
||||
"applicationId": self.application_id,
|
||||
"generationNumber": 0,
|
||||
"expires": "2025-01-02T00:00:00.000Z",
|
||||
"expires": "2025-01-30T00:00:00.000Z",
|
||||
"signature": self.signature
|
||||
}
|
||||
# Convert to a JSON string
|
||||
|
|
|
@ -1,11 +1,13 @@
|
|||
from pydantic import BaseModel
|
||||
|
||||
from jobs import Country, JobType, DescriptionFormat
|
||||
from model.User import User
|
||||
from scrapers.site import Site
|
||||
|
||||
|
||||
class ScraperInput(BaseModel):
|
||||
site_type: list[Site]
|
||||
user: User
|
||||
search_term: str | None = None
|
||||
google_search_term: str | None = None
|
||||
|
||||
|
@ -22,4 +24,4 @@ class ScraperInput(BaseModel):
|
|||
description_format: DescriptionFormat | None = DescriptionFormat.MARKDOWN
|
||||
|
||||
results_wanted: int = 15
|
||||
hours_old: int | None = None
|
||||
hours_old: int | None = None
|
||||
|
|
|
@ -58,6 +58,7 @@ class TelegramDefaultHandler(TelegramHandler):
|
|||
f"Start scarping: {site_names_print}")
|
||||
filtered_out_jobs, jobs = scrape_jobs(
|
||||
site_name=self.sites_to_scrap,
|
||||
user=user,
|
||||
search_term=user.position.value,
|
||||
locations=locations,
|
||||
results_wanted=200,
|
||||
|
|
Loading…
Reference in New Issue