mirror of https://github.com/Bunsly/JobSpy

commit 3db58a84a5 ("fixed goozali")
parent 0d01789313
@@ -12,6 +12,7 @@ from jobs import (
     Country,
     JobPost,
 )
+from model.User import User
 from .glassdoor import GlassdoorScraper
 from .google import GoogleJobsScraper
 from .goozali import GoozaliScraper
@@ -30,6 +31,7 @@ class SalarySource(Enum):
 
 def scrape_jobs(
     site_name: str | list[str] | Site | list[Site] | None = None,
+    user: User = None,
     search_term: str | None = None,
     google_search_term: str | None = None,
     location: str | None = None,
@@ -93,6 +95,7 @@ def scrape_jobs(
 
     country_enum = Country.from_string(country_indeed)
     scraper_input = ScraperInput(
+        user=user,
         site_type=get_site_type(),
         country=country_enum,
         search_term=search_term,
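
With the new parameter, callers thread the Telegram user through scrape_jobs into ScraperInput instead of resolving it inside the scrapers. A minimal sketch of a call site, assuming scrape_jobs is importable from the package root and that User carries a position enum; the real model lives in model/User.py and is not part of this diff:

    from enum import Enum

    from model.User import User

    class Position(Enum):  # stand-in; the real enum ships with the project
        BACKEND = "Backend Developer"

    # Hypothetical field values; User's actual fields live in model/User.py.
    user = User(username="someone", position=Position.BACKEND)
    filtered_out_jobs, jobs = scrape_jobs(  # assumes scrape_jobs is imported
        site_name=["goozali"],
        user=user,                        # new: forwarded into ScraperInput
        search_term=user.position.value,
        results_wanted=200,
    )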
@@ -13,12 +13,20 @@ class GoozaliScrapperComponent:
         pass
 
     # Function to filter GoozaliRows based on hours old
-    def filter_rows_by_column_choice(self, rows: list[GoozaliRow], column: GoozaliColumn, column_choice: GoozaliColumnChoice) -> list[GoozaliRow]:
+    def filter_rows_by_column_choice(self, rows: list[GoozaliRow], column: GoozaliColumn,
+                                     column_choices: list[GoozaliColumnChoice]) -> list[GoozaliRow]:
         return [
-            row for row in rows
-            if row.cellValuesByColumnId[column.id] == column_choice.id
+            row
+            for row in rows
+            if row.cellValuesByColumnId.get(column.id)
+            and any(choice.id == row.cellValuesByColumnId[column.id] for choice in column_choices)
         ]
 
+        # return [
+        #     row for row in rows
+        #     if row.cellValuesByColumnId[column.id] == column_choice.id
+        # ]
+
     def filter_rows_by_hours(self, rows: list[GoozaliRow], hours: int) -> list[GoozaliRow]:
         # Current time
         now = datetime.now()
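
Two behavior changes ride along with the switch to column_choices here: the row lookup now goes through .get(), so rows that lack the column are skipped instead of raising KeyError, and a row passes if its value matches any of the requested choices rather than a single one. A self-contained sketch of the same logic, with stand-in types since the real GoozaliRow and GoozaliColumnChoice models are not shown in this diff:

    from dataclasses import dataclass, field

    @dataclass
    class Choice:  # stand-in for GoozaliColumnChoice
        id: str

    @dataclass
    class Row:  # stand-in for GoozaliRow
        cellValuesByColumnId: dict = field(default_factory=dict)

    def filter_rows(rows: list[Row], column_id: str, choices: list[Choice]) -> list[Row]:
        # .get() skips rows missing the column; any() accepts several choices.
        return [
            row
            for row in rows
            if row.cellValuesByColumnId.get(column_id)
            and any(c.id == row.cellValuesByColumnId[column_id] for c in choices)
        ]

    rows = [Row({"fld1": "selA"}), Row({"fld1": "selC"}), Row({})]
    print(len(filter_rows(rows, "fld1", [Choice("selA"), Choice("selB")])))  # 1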
@@ -39,14 +47,20 @@ class GoozaliScrapperComponent:
             if (column.name == column_name):
                 return column
 
-    def find_choice_from_column(self, column: GoozaliColumn, choice_name: GoozaliFieldChoice) -> GoozaliColumnChoice:
+    def find_choices_from_column(self, column: GoozaliColumn, choices: list[GoozaliFieldChoice]) -> list[
+            GoozaliColumnChoice]:
         if not column.typeOptions.choices:
             logger.exception(f"Choices for column {column.name} doesn't exist")
             raise Exception(f"Choices for column {column.name} doesn't exist")
+        chosen_values = [c.value for c in choices]
+        goozali_column_choices = []
 
         for key, choice in column.typeOptions.choices.items():
-            if choice.name == choice_name.value:
-                return choice
+            if choice.name in chosen_values:
+                goozali_column_choices.append(choice)
 
-        logger.exception(f"Can't find {choice_name} for column {column.name}")
-        raise Exception(f"Can't find {choice_name} for column {column.name}")
+        if len(goozali_column_choices) == 0:
+            logger.exception(f"Can't find {choices} for column {column.name}")
+            raise Exception(f"Can't find {choices} for column {column.name}")
+
+        return goozali_column_choices
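
find_choices_from_column now collects every choice whose name is among the requested field values and raises only when nothing matched, where the old find_choice_from_column returned the first hit or raised. A reduced sketch of that control flow, with plain strings standing in for the Goozali choice models:

    def find_choices(available: dict[str, str], wanted_values: list[str]) -> list[str]:
        # available maps choice id -> choice name, mirroring typeOptions.choices.
        matches = [name for name in available.values() if name in wanted_values]
        if len(matches) == 0:
            raise Exception(f"Can't find {wanted_values}")
        return matches

    print(find_choices({"selA": "Backend", "selB": "Frontend"}, ["Backend"]))  # ['Backend']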
@@ -7,8 +7,10 @@ This module contains routines to scrape Goozali.
 
 from __future__ import annotations
 
-from model.User import User
-from model.user_repository import user_repository
+from jobs import (
+    JobPost,
+    JobResponse,
+)
 from .GoozaliMapper import GoozaliMapper
 from .GoozaliScrapperComponent import GoozaliScrapperComponent
 from .constants import extract_goozali_column_name, job_post_column_to_goozali_column, position_to_goozali_field_map
@@ -16,14 +18,9 @@ from .model import GoozaliColumn, GoozaliFieldChoice, GoozaliPartRequest, Goozal
 from ..scraper import Scraper
 from ..scraper_input import ScraperInput
 from ..site import Site
 
 from ..utils import create_dict_by_key_and_value, create_session, create_logger
-from jobs import (
-    JobPost,
-    JobResponse,
-)
-
-logger = create_logger("Goozali")
+logger = create_logger("GoozaliScraper")
 
 
 class GoozaliScraper(Scraper):
@@ -82,12 +79,11 @@ class GoozaliScraper(Scraper):
         # filter result by Field
         column = self.component.find_column(
             goozali_response.data.columns, job_post_column_to_goozali_column["field"])
-        user: User = user_repository.find_by_username()
-        user_goozali_field = position_to_goozali_field_map[user.position]
-        column_choice = self.component.find_choice_from_column(
-            column, user_goozali_field)
+        user_goozali_fields = position_to_goozali_field_map[scraper_input.user.position]
+        column_choices = self.component.find_choices_from_column(
+            column, user_goozali_fields)
         filtered_rows_by_column_choice = self.component.filter_rows_by_column_choice(
-            goozali_response.data.rows, column, column_choice)
+            goozali_response.data.rows, column, column_choices)
         filtered_rows_by_age_and_column_choice = self.component.filter_rows_by_hours(
             filtered_rows_by_column_choice, scraper_input.hours_old)
         dict_column_name_to_column: dict[str, GoozaliColumn] = create_dict_by_key_and_value(
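
The scraper now reads the position straight off scraper_input.user rather than querying user_repository, so the Telegram layer owns the user lookup. A tiny sketch of the field-resolution step; the shape of position_to_goozali_field_map is an assumption here, as the real map in the goozali constants module is not shown:

    from enum import Enum

    class Position(Enum):  # stand-in for the real position enum
        BACKEND = "Backend Developer"

    # Hypothetical map shape: position -> list of Goozali field choices.
    position_to_goozali_field_map = {Position.BACKEND: ["Backend", "Full-Stack"]}

    user_position = Position.BACKEND  # would come from scraper_input.user.position
    print(position_to_goozali_field_map[user_position])  # ['Backend', 'Full-Stack']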
@@ -6,13 +6,13 @@ class GoozaliFullRequest():
         self.view_id: str = "viwIOzPYaUGxlA0Jd"
         self.url = base_url.format(view_id=self.view_id)
         self.application_id: str = "appwewqLk7iUY4azc"
-        self.air_table_page_load_id: str = "pglqAAzFDZEWCEC7s"
+        self.air_table_page_load_id: str = "pglke45UFwdvQgBNJ"
         self.stringifiedObjectParams = {
             "shouldUseNestedResponseFormat": "true"}
         self.cookies: dict[str, str] = {}
-        self.request_id: str = "req4q4tKw3woEEWxw&"
+        self.request_id: str = "reqGjlEjOQFyRssam"
         self.share_id: str = "shrQBuWjXd0YgPqV6"
-        self.signature: str = "be8bd40c133f051f929ebab311c416013f5af0d5acae4264575b88ccf051ee59"
+        self.signature: str = "7a1402a3f7f6f9a23c8db3849878812f2d3141da60f3b3d6e14dd4a910b91b74"
         self.headers = self._generate_headers()
         self.params = self._generate_params()
         self.cookies = {}
@@ -66,7 +66,7 @@ class GoozaliFullRequest():
             "shareId": self.share_id,
             "applicationId": self.application_id,
             "generationNumber": 0,
-            "expires": "2025-01-02T00:00:00.000Z",
+            "expires": "2025-01-30T00:00:00.000Z",
             "signature": self.signature
         }
         # Convert to a JSON string
@@ -1,11 +1,13 @@
 from pydantic import BaseModel
 
 from jobs import Country, JobType, DescriptionFormat
+from model.User import User
 from scrapers.site import Site
 
 
 class ScraperInput(BaseModel):
     site_type: list[Site]
+    user: User
     search_term: str | None = None
     google_search_term: str | None = None
 
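
Because user is declared without a default, pydantic treats it as required: every ScraperInput(...) construction, like the one in the scrape_jobs hunk above, must now supply it. A minimal sketch with a stand-in User, since the real model in model/User.py is not shown here:

    from pydantic import BaseModel

    class User(BaseModel):  # stand-in for model.User.User
        username: str

    class Input(BaseModel):  # reduced ScraperInput
        user: User
        search_term: str | None = None

    Input(user=User(username="someone"), search_term="python")
    # Input(search_term="python") raises a ValidationError for the missing user.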
@@ -58,6 +58,7 @@ class TelegramDefaultHandler(TelegramHandler):
             f"Start scarping: {site_names_print}")
         filtered_out_jobs, jobs = scrape_jobs(
             site_name=self.sites_to_scrap,
+            user=user,
             search_term=user.position.value,
             locations=locations,
             results_wanted=200,