mirror of https://github.com/Bunsly/JobSpy

commit 00f13bdf1a (parent 3dc15195d5)
current plan to get the response from goozali

@@ -77,189 +77,15 @@ class GoozaliScraper(Scraper):
        self.scraper_input = scraper_input
        job_list: list[JobPost] = []
        seen_ids = set()
        # Round the requested offset down to a page boundary (10 results per page).
        start = scraper_input.offset // 10 * 10 if scraper_input.offset else 0
        request_count = 0
        seconds_old = (
            scraper_input.hours_old * 3600 if scraper_input.hours_old else None
        )
        continue_search = (
            lambda: len(job_list) < scraper_input.results_wanted and start < 1000
        )
        for location in scraper_input.locations:
            logger.info(f"start searching for location: {location}")
            while continue_search():
                request_count += 1
                logger.info(
                    f"search page: {request_count} / "
                    f"{math.ceil(scraper_input.results_wanted / 10)}"
                )
                params = {
                    "keywords": scraper_input.search_term,
                    "location": location,
                    "distance": scraper_input.distance,
                    "f_WT": 2 if scraper_input.is_remote else None,
                    "f_JT": (
                        self.job_type_code(scraper_input.job_type)
                        if scraper_input.job_type
                        else None
                    ),
                    "pageNum": 0,
                    "start": start,
                    "f_AL": "true" if scraper_input.easy_apply else None,
                    "f_C": (
                        ",".join(map(str, scraper_input.Goozali_company_ids))
                        if scraper_input.Goozali_company_ids
                        else None
                    ),
                }
                if seconds_old is not None:
                    params["f_TPR"] = f"r{seconds_old}"

                params = {k: v for k, v in params.items() if v is not None}
                try:
                    response = self.session.get(
                        f"{self.base_url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
                        params=params,
                        timeout=10,
                    )
                    if response.status_code not in range(200, 400):
                        if response.status_code == 429:
                            err = "429 Response - Blocked by Goozali for too many requests"
                        else:
                            err = f"Goozali response status code {response.status_code}"
                        err += f" - {response.text}"
                        logger.error(err)
                        return JobResponse(jobs=job_list)
                except MaxRetryError as e:
                    # Raised when the maximum number of retries is exceeded.
                    logger.error(f"MaxRetryError for location {location}: {str(e)}")
                    break
                except RetryError as e:
                    # Custom retry logic failed.
                    logger.error(f"RetryError for location {location}: {str(e)}")
                    break
                except Exception as e:
                    if "Proxy responded with" in str(e):
                        logger.error("Goozali: Bad proxy")
                    else:
                        logger.error(f"Goozali: {str(e)}")
                    return JobResponse(jobs=job_list)

                soup = BeautifulSoup(response.text, "html.parser")
                job_cards = soup.find_all("div", class_="base-search-card")
                if len(job_cards) == 0:
                    break

                for job_card in job_cards:
                    href_tag = job_card.find("a", class_="base-card__full-link")
                    if href_tag and "href" in href_tag.attrs:
                        href = href_tag.attrs["href"].split("?")[0]
                        job_id = href.split("-")[-1]

                        if job_id in seen_ids:
                            continue
                        seen_ids.add(job_id)

                        try:
                            fetch_desc = scraper_input.Goozali_fetch_description
                            job_post = self._process_job(job_card, job_id, fetch_desc)
                            if job_post:
                                job_list.append(job_post)
                            if not continue_search():
                                break
                        except Exception as e:
                            raise GoozaliException(str(e))

                if continue_search():
                    time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
                    start += len(job_list)

        job_list = job_list[: scraper_input.results_wanted]
        # Plan for the Goozali flow (see the sketch after this method):
        # create url
        # create session -> run the api
        # model the response with models
        # create map columnId to Column object
        # filter result by Field like the web
        # filter by date
        # map to JobResponse Object
        return JobResponse(jobs=job_list)
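
    # The plan comments above only name the steps. A hedged sketch of how they
    # might compose follows - illustrative only, not part of this commit: the
    # endpoint path, the GoozaliResponse/Column models, and the helper methods
    # (_matches_filters, _filter_by_hours_old, _to_job_post) are assumed names.
    def _fetch_goozali_jobs(self, scraper_input: ScraperInput) -> JobResponse:
        # create url (assumed shape of the shared-view endpoint)
        url = f"{self.base_url}/readSharedViewData"
        # create session -> run the api
        response = self.session.get(url, timeout=10)
        # model the response with models (hypothetical GoozaliResponse model)
        data = GoozaliResponse(**response.json())
        # create map columnId to Column object
        columns = {column.id: column for column in data.table.columns}
        # filter result by Field like the web
        rows = [row for row in data.table.rows if self._matches_filters(row, columns)]
        # filter by date
        rows = self._filter_by_hours_old(rows, scraper_input.hours_old)
        # map to JobResponse Object
        return JobResponse(jobs=[self._to_job_post(row, columns) for row in rows])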

    def _process_job(
        self, job_card: Tag, job_id: str, full_descr: bool
    ) -> Optional[JobPost]:
        salary_tag = job_card.find("span", class_="job-search-card__salary-info")

        compensation = None
        if salary_tag:
            salary_text = salary_tag.get_text(separator=" ").strip()
            salary_values = [currency_parser(value) for value in salary_text.split("-")]
            salary_min = salary_values[0]
            salary_max = salary_values[1]
            # A leading "$" is treated as USD; any other leading symbol is kept as-is.
            currency = salary_text[0] if salary_text[0] != "$" else "USD"

            compensation = Compensation(
                min_amount=int(salary_min),
                max_amount=int(salary_max),
                currency=currency,
            )

        title_tag = job_card.find("span", class_="sr-only")
        title = title_tag.get_text(strip=True) if title_tag else "N/A"

        company_tag = job_card.find("h4", class_="base-search-card__subtitle")
        company_a_tag = company_tag.find("a") if company_tag else None
        company_url = (
            urlunparse(urlparse(company_a_tag.get("href"))._replace(query=""))
            if company_a_tag and company_a_tag.has_attr("href")
            else ""
        )
        company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A"

        metadata_card = job_card.find("div", class_="base-search-card__metadata")
        location = self._get_location(metadata_card)

        datetime_tag = (
            metadata_card.find("time", class_="job-search-card__listdate")
            if metadata_card
            else None
        )
        date_posted = None
        if datetime_tag and "datetime" in datetime_tag.attrs:
            datetime_str = datetime_tag["datetime"]
            try:
                date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
            except ValueError:
                date_posted = None
        job_details = {}
        if full_descr:
            job_details = self._get_job_details(job_id)

        return JobPost(
            id=f"li-{job_id}",
            title=title,
            company_name=company,
            company_url=company_url,
            location=location,
            date_posted=date_posted,
            datetime_posted=date_posted,
            job_url=f"{self.base_url}/jobs/view/{job_id}",
            compensation=compensation,
            job_type=job_details.get("job_type"),
            job_level=job_details.get("job_level", "").lower(),
            company_industry=job_details.get("company_industry"),
            description=job_details.get("description"),
            job_url_direct=job_details.get("job_url_direct"),
            emails=extract_emails_from_text(job_details.get("description")),
            company_logo=job_details.get("company_logo"),
            job_function=job_details.get("job_function"),
        )

    def _get_job_details(self, job_id: str) -> dict:
        """
        Retrieves job description and other job details by going to the job page url
        """
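        # The diff view is cut off here, so the real body of _get_job_details is
        # not shown. Purely as an illustration of what the docstring describes -
        # the selector class name and the returned keys below are assumptions,
        # not the repository's code:
        response = self.session.get(f"{self.base_url}/jobs/view/{job_id}", timeout=10)
        if response.status_code not in range(200, 400):
            return {}
        soup = BeautifulSoup(response.text, "html.parser")
        description_tag = soup.find("div", class_="description")  # assumed class name
        description = (
            description_tag.get_text(separator="\n").strip() if description_tag else None
        )
        return {"description": description}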