From 00f13bdf1aadef586457767bd568e54b93a964d7 Mon Sep 17 00:00:00 2001
From: Yariv Menachem
Date: Thu, 12 Dec 2024 17:34:27 +0200
Subject: [PATCH] current plan to get the response from goozali

---
 src/jobspy/scrapers/goozali/__init__.py | 188 +-----------------------
 1 file changed, 7 insertions(+), 181 deletions(-)

diff --git a/src/jobspy/scrapers/goozali/__init__.py b/src/jobspy/scrapers/goozali/__init__.py
index d6a1e4b..6e37b83 100644
--- a/src/jobspy/scrapers/goozali/__init__.py
+++ b/src/jobspy/scrapers/goozali/__init__.py
@@ -77,189 +77,15 @@ class GoozaliScraper(Scraper):
         self.scraper_input = scraper_input
         job_list: list[JobPost] = []
         seen_ids = set()
-        start = scraper_input.offset // 10 * 10 if scraper_input.offset else 0
-        request_count = 0
-        seconds_old = (
-            scraper_input.hours_old * 3600 if scraper_input.hours_old else None
-        )
-        continue_search = (
-            lambda: len(
-                job_list) < scraper_input.results_wanted and start < 1000
-        )
-        for location in scraper_input.locations:
-            logger.info(f"start searching for location: {location}")
-            while continue_search():
-                request_count += 1
-                logger.info(
-                    f"search page: {
-                        request_count} / {math.ceil(scraper_input.results_wanted / 10)}"
-                )
-                params = {
-                    "keywords": scraper_input.search_term,
-                    "location": location,
-                    "distance": scraper_input.distance,
-                    "f_WT": 2 if scraper_input.is_remote else None,
-                    "f_JT": (
-                        self.job_type_code(scraper_input.job_type)
-                        if scraper_input.job_type
-                        else None
-                    ),
-                    "pageNum": 0,
-                    "start": start,
-                    "f_AL": "true" if scraper_input.easy_apply else None,
-                    "f_C": (
-                        ",".join(map(str, scraper_input.Goozali_company_ids))
-                        if scraper_input.Goozali_company_ids
-                        else None
-                    ),
-                }
-                if seconds_old is not None:
-                    params["f_TPR"] = f"r{seconds_old}"
-
-                params = {k: v for k, v in params.items() if v is not None}
-                try:
-                    response = self.session.get(
-                        f"{self.base_url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
-                        params=params,
-                        timeout=10,
-                    )
-                    if response.status_code not in range(200, 400):
-                        if response.status_code == 429:
-                            err = (
-                                f"429 Response - Blocked by Goozali for too many requests"
-                            )
-                        else:
-                            err = f"Goozali response status code {
-                                response.status_code}"
-                        err += f" - {response.text}"
-                        logger.error(err)
-                        return JobResponse(jobs=job_list)
-                except MaxRetryError as e:
-                    """Raised when the maximum number of retries is exceeded."""
-                    logger.error(f"RetryError: {str(e)}")
-                    logger.error(f"MaxRetryError for location: {location}")
-                    break
-                except RetryError as e:
-                    """Custom retries logic failed"""
-                    logger.error(f"RetryError: {str(e)}")
-                    logger.error(f"RetryError for location: {location}")
-                    break
-                except Exception as e:
-                    if "Proxy responded with" in str(e):
-                        logger.error(f"Goozali: Bad proxy")
-                    else:
-                        logger.error(f"Goozali: {str(e)}")
-                    return JobResponse(jobs=job_list)
-
-                soup = BeautifulSoup(response.text, "html.parser")
-                job_cards = soup.find_all("div", class_="base-search-card")
-                if len(job_cards) == 0:
-                    break
-
-                for job_card in job_cards:
-                    href_tag = job_card.find(
-                        "a", class_="base-card__full-link")
-                    if href_tag and "href" in href_tag.attrs:
-                        href = href_tag.attrs["href"].split("?")[0]
-                        job_id = href.split("-")[-1]
-
-                        if job_id in seen_ids:
-                            continue
-                        seen_ids.add(job_id)
-
-                        try:
-                            fetch_desc = scraper_input.Goozali_fetch_description
-                            job_post = self._process_job(
-                                job_card, job_id, fetch_desc)
-                            if job_post:
-                                job_list.append(job_post)
-                            if not continue_search():
-                                break
-                        except Exception as e:
-                            raise GoozaliException(str(e))
-
-                if continue_search():
-                    time.sleep(random.uniform(
-                        self.delay, self.delay + self.band_delay))
-                    start += len(job_list)
-
-        job_list = job_list[: scraper_input.results_wanted]
+        # create url
+        # create session -> run the api
+        # model the response with models
+        # create map columnId to Column object
+        # filter result by Field like the web
+        # filter by date
+        # map to JobResponse Object
         return JobResponse(jobs=job_list)
 
-    def _process_job(
-        self, job_card: Tag, job_id: str, full_descr: bool
-    ) -> Optional[JobPost]:
-        salary_tag = job_card.find(
-            "span", class_="job-search-card__salary-info")
-
-        compensation = None
-        if salary_tag:
-            salary_text = salary_tag.get_text(separator=" ").strip()
-            salary_values = [currency_parser(value)
-                             for value in salary_text.split("-")]
-            salary_min = salary_values[0]
-            salary_max = salary_values[1]
-            currency = salary_text[0] if salary_text[0] != "$" else "USD"
-
-            compensation = Compensation(
-                min_amount=int(salary_min),
-                max_amount=int(salary_max),
-                currency=currency,
-            )
-
-        title_tag = job_card.find("span", class_="sr-only")
-        title = title_tag.get_text(strip=True) if title_tag else "N/A"
-
-        company_tag = job_card.find("h4", class_="base-search-card__subtitle")
-        company_a_tag = company_tag.find("a") if company_tag else None
-        company_url = (
-            urlunparse(urlparse(company_a_tag.get("href"))._replace(query=""))
-            if company_a_tag and company_a_tag.has_attr("href")
-            else ""
-        )
-        company = company_a_tag.get_text(
-            strip=True) if company_a_tag else "N/A"
-
-        metadata_card = job_card.find(
-            "div", class_="base-search-card__metadata")
-        location = self._get_location(metadata_card)
-
-        datetime_tag = (
-            metadata_card.find("time", class_="job-search-card__listdate")
-            if metadata_card
-            else None
-        )
-        date_posted = None
-        if datetime_tag and "datetime" in datetime_tag.attrs:
-            datetime_str = datetime_tag["datetime"]
-            try:
-                date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
-            except:
-                date_posted = None
-        job_details = {}
-        if full_descr:
-            job_details = self._get_job_details(job_id)
-
-        return JobPost(
-            id=f"li-{job_id}",
-            title=title,
-            company_name=company,
-            company_url=company_url,
-            location=location,
-            date_posted=date_posted,
-            datetime_posted=date_posted,
-            job_url=f"{self.base_url}/jobs/view/{job_id}",
-            compensation=compensation,
-            job_type=job_details.get("job_type"),
-            job_level=job_details.get("job_level", "").lower(),
-            company_industry=job_details.get("company_industry"),
-            description=job_details.get("description"),
-            job_url_direct=job_details.get("job_url_direct"),
-            emails=extract_emails_from_text(job_details.get("description")),
-            company_logo=job_details.get("company_logo"),
-            job_function=job_details.get("job_function"),
-        )
-
     def _get_job_details(self, job_id: str) -> dict:
         """
         Retrieves job description and other job details by going to the job page url
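
Note: the seven "+" comment lines above are the whole of this commit's plan; no implementation lands here yet. Below is a minimal sketch of how those steps might be wired up once the models exist. It is an illustration only, not the author's implementation: `fetch_goozali_rows`, `GoozaliColumn`, `GoozaliRow`, the endpoint path, a JSON payload with "columns"/"rows" keys, a column literally named "Field", and ISO-8601 UTC "createdTime" stamps are all assumptions not confirmed by this patch.

    from __future__ import annotations

    from dataclasses import dataclass, field
    from datetime import datetime, timedelta, timezone

    import requests


    @dataclass
    class GoozaliColumn:
        # Hypothetical model backing the "map columnId to Column object" step.
        id: str
        name: str


    @dataclass
    class GoozaliRow:
        # Hypothetical model for one result row in the API response.
        id: str
        created_time: datetime
        cell_values_by_column_id: dict[str, object] = field(default_factory=dict)


    def fetch_goozali_rows(
        base_url: str, view_id: str, hours_old: int | None = None
    ) -> list[GoozaliRow]:
        """Walk the commented plan end to end: create url -> session/api ->
        model the response -> column map -> filter by Field -> filter by date."""
        # create url (endpoint path is an assumption)
        url = f"{base_url}/view/{view_id}/data"

        # create session -> run the api
        session = requests.Session()
        response = session.get(url, timeout=10)
        response.raise_for_status()
        payload = response.json()

        # model the response with models (payload shape is an assumption)
        columns = [GoozaliColumn(id=c["id"], name=c["name"]) for c in payload["columns"]]
        rows = [
            GoozaliRow(
                id=r["id"],
                # assumes UTC stamps such as "2024-12-12T15:30:00Z"
                created_time=datetime.fromisoformat(
                    r["createdTime"].replace("Z", "+00:00")
                ),
                cell_values_by_column_id=r.get("cellValuesByColumnId", {}),
            )
            for r in payload["rows"]
        ]

        # create map columnId to Column object
        columns_by_id = {column.id: column for column in columns}

        # filter result by Field like the web (column name "Field" is an assumption)
        field_column = next(
            (c for c in columns_by_id.values() if c.name == "Field"), None
        )
        if field_column is not None:
            rows = [r for r in rows if r.cell_values_by_column_id.get(field_column.id)]

        # filter by date
        if hours_old is not None:
            cutoff = datetime.now(timezone.utc) - timedelta(hours=hours_old)
            rows = [r for r in rows if r.created_time >= cutoff]

        return rows


    # usage sketch (URL and view id are placeholders):
    # rows = fetch_goozali_rows("https://example.com/api", "view-id", hours_old=72)

The final plan step, mapping rows into JobPost/JobResponse objects, is deliberately left out here because the models it depends on are not part of this patch.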