Current plan to get the response from Goozali

pull/231/head
Yariv Menachem 2024-12-12 17:34:27 +02:00
parent 3dc15195d5
commit 00f13bdf1a
1 changed file with 7 additions and 181 deletions

@@ -77,189 +77,15 @@ class GoozaliScraper(Scraper):
self.scraper_input = scraper_input
job_list: list[JobPost] = []
seen_ids = set()
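# Round the requested offset down to a full page (10 results per page).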
start = scraper_input.offset // 10 * 10 if scraper_input.offset else 0
request_count = 0
seconds_old = (
scraper_input.hours_old * 3600 if scraper_input.hours_old else None
)
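# Page until enough results are collected or the 1000-result pagination cap is hit.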
continue_search = lambda: len(job_list) < scraper_input.results_wanted and start < 1000
for location in scraper_input.locations:
logger.info(f"start searching for location: {location}")
while continue_search():
request_count += 1
logger.info(f"search page: {request_count} / {math.ceil(scraper_input.results_wanted / 10)}")
params = {
"keywords": scraper_input.search_term,
"location": location,
"distance": scraper_input.distance,
"f_WT": 2 if scraper_input.is_remote else None,
"f_JT": (
self.job_type_code(scraper_input.job_type)
if scraper_input.job_type
else None
),
"pageNum": 0,
"start": start,
"f_AL": "true" if scraper_input.easy_apply else None,
"f_C": (
",".join(map(str, scraper_input.Goozali_company_ids))
if scraper_input.Goozali_company_ids
else None
),
}
if seconds_old is not None:
params["f_TPR"] = f"r{seconds_old}"
params = {k: v for k, v in params.items() if v is not None}
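# Query the guest jobs search endpoint; any non-2xx/3xx status aborts the search.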
try:
response = self.session.get(
f"{self.base_url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
params=params,
timeout=10,
)
if response.status_code not in range(200, 400):
if response.status_code == 429:
err = "429 Response - Blocked by Goozali for too many requests"
else:
err = f"Goozali response status code {
response.status_code}"
err += f" - {response.text}"
logger.error(err)
return JobResponse(jobs=job_list)
except MaxRetryError as e:
# Raised when the maximum number of retries is exceeded.
logger.error(f"MaxRetryError for location {location}: {str(e)}")
break
except RetryError as e:
# Custom retry logic failed.
logger.error(f"RetryError for location {location}: {str(e)}")
break
except Exception as e:
if "Proxy responded with" in str(e):
logger.error(f"Goozali: Bad proxy")
else:
logger.error(f"Goozali: {str(e)}")
return JobResponse(jobs=job_list)
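# Parse the returned HTML fragment and extract one card per job posting.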
soup = BeautifulSoup(response.text, "html.parser")
job_cards = soup.find_all("div", class_="base-search-card")
if len(job_cards) == 0:
break
for job_card in job_cards:
href_tag = job_card.find(
"a", class_="base-card__full-link")
if href_tag and "href" in href_tag.attrs:
href = href_tag.attrs["href"].split("?")[0]
job_id = href.split("-")[-1]
if job_id in seen_ids:
continue
seen_ids.add(job_id)
try:
fetch_desc = scraper_input.Goozali_fetch_description
job_post = self._process_job(
job_card, job_id, fetch_desc)
if job_post:
job_list.append(job_post)
if not continue_search():
break
except Exception as e:
raise GoozaliException(str(e)) from e
if continue_search():
time.sleep(random.uniform(
self.delay, self.delay + self.band_delay))
start += len(job_list)
job_list = job_list[: scraper_input.results_wanted]
# Planned Goozali flow (sketched after this method):
# 1. Build the request URL.
# 2. Create a session and call the API.
# 3. Model the response with typed models.
# 4. Build a map from columnId to its Column object.
# 5. Filter the rows by Field, as the web UI does.
# 6. Filter by date.
# 7. Map the result to a JobResponse object.
return JobResponse(jobs=job_list)
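# A minimal sketch of how the seven planned steps above could fit together.
# Every name introduced here (build_goozali_url, GoozaliResponse, Column,
# matches_field_filters, row_to_job_post) is a hypothetical placeholder, not
# an existing API in this repository; it also assumes
# `from datetime import datetime, timedelta` at module level.
def _scrape_goozali_sketch(self, scraper_input: ScraperInput) -> JobResponse:
    url = build_goozali_url(scraper_input)  # 1. create url
    response = self.session.get(url, timeout=10)  # 2. session -> run the api
    data = GoozaliResponse(**response.json())  # 3. model the response
    columns = {col.id: col for col in data.columns}  # 4. columnId -> Column
    # 5. keep only rows that pass the same Field filters the web UI applies
    rows = [row for row in data.rows if matches_field_filters(row, columns)]
    if scraper_input.hours_old:  # 6. filter by date
        cutoff = datetime.now() - timedelta(hours=scraper_input.hours_old)
        rows = [row for row in rows if row.created_time >= cutoff]
    jobs = [row_to_job_post(row, columns) for row in rows]  # 7. map to JobPost
    return JobResponse(jobs=jobs[: scraper_input.results_wanted])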
def _process_job(
self, job_card: Tag, job_id: str, full_descr: bool
) -> Optional[JobPost]:
salary_tag = job_card.find(
"span", class_="job-search-card__salary-info")
compensation = None
if salary_tag:
salary_text = salary_tag.get_text(separator=" ").strip()
salary_values = [currency_parser(value)
for value in salary_text.split("-")]
salary_min = salary_values[0]
salary_max = salary_values[1]
currency = salary_text[0] if salary_text[0] != "$" else "USD"
compensation = Compensation(
min_amount=int(salary_min),
max_amount=int(salary_max),
currency=currency,
)
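# The job title is carried by the card's visually hidden (sr-only) span.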
title_tag = job_card.find("span", class_="sr-only")
title = title_tag.get_text(strip=True) if title_tag else "N/A"
company_tag = job_card.find("h4", class_="base-search-card__subtitle")
company_a_tag = company_tag.find("a") if company_tag else None
company_url = (
urlunparse(urlparse(company_a_tag.get("href"))._replace(query=""))
if company_a_tag and company_a_tag.has_attr("href")
else ""
)
company = company_a_tag.get_text(
strip=True) if company_a_tag else "N/A"
metadata_card = job_card.find(
"div", class_="base-search-card__metadata")
location = self._get_location(metadata_card)
datetime_tag = (
metadata_card.find("time", class_="job-search-card__listdate")
if metadata_card
else None
)
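# The <time> tag's datetime attribute carries the posted date as YYYY-MM-DD.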
date_posted = None
if datetime_tag and "datetime" in datetime_tag.attrs:
datetime_str = datetime_tag["datetime"]
try:
date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
except ValueError:
date_posted = None
job_details = {}
if full_descr:
job_details = self._get_job_details(job_id)
return JobPost(
id=f"li-{job_id}",
title=title,
company_name=company,
company_url=company_url,
location=location,
date_posted=date_posted,
datetime_posted=date_posted,
job_url=f"{self.base_url}/jobs/view/{job_id}",
compensation=compensation,
job_type=job_details.get("job_type"),
job_level=job_details.get("job_level", "").lower(),
company_industry=job_details.get("company_industry"),
description=job_details.get("description"),
job_url_direct=job_details.get("job_url_direct"),
emails=extract_emails_from_text(job_details.get("description")),
company_logo=job_details.get("company_logo"),
job_function=job_details.get("job_function"),
)
def _get_job_details(self, job_id: str) -> dict:
"""
Retrieves job description and other job details by going to the job page url