From 9a4246901e69dc062fea3aecf16d71e74fade7f9 Mon Sep 17 00:00:00 2001
From: Yariv Menachem
Date: Mon, 23 Dec 2024 18:01:07 +0200
Subject: [PATCH] some fixes and logs

---
 src/jobspy/__init__.py                    | 32 ++++++++----
 src/jobspy/main.py                        | 40 +++++++-------
 src/jobspy/scrapers/glassdoor/__init__.py | 64 ++++++++++++++---------
 src/jobspy/scrapers/goozali/__init__.py   |  1 +
 4 files changed, 82 insertions(+), 55 deletions(-)

diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py
index d7b9b06..168b9da 100644
--- a/src/jobspy/__init__.py
+++ b/src/jobspy/__init__.py
@@ -1,4 +1,5 @@
 from __future__ import annotations
+from threading import Lock
 
 import pandas as pd
 from typing import Tuple
@@ -118,23 +119,34 @@ def scrape_jobs(
 
     site_to_jobs_dict = {}
     merged_jobs: list[JobPost] = []
+    lock = Lock()
 
     def worker(site):
-        site_val, scraped_info = scrape_site(site)
-        # Add the scraped jobs to the merged list
-        # Assuming scraped_info has 'jobs' as a list
-        merged_jobs.extend(scraped_info.jobs)
+        logger = create_logger(f"Worker {site}")
+        logger.info("Starting")
+        try:
+            site_val, scraped_info = scrape_site(site)
+            with lock:
+                merged_jobs.extend(scraped_info.jobs)
+            logger.info("Finished")
+            return site_val, scraped_info
+        except Exception as e:
+            logger.error(f"Error: {e}")
+            return None, None
 
-        return site_val, scraped_info
-
-    with ThreadPoolExecutor() as executor:
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        logger = create_logger("ThreadPoolExecutor")
         future_to_site = {
             executor.submit(worker, site): site for site in scraper_input.site_type
         }
-
+        # An iterator over the given futures that yields each as it completes.
         for future in as_completed(future_to_site):
-            site_value, scraped_data = future.result()
-            site_to_jobs_dict[site_value] = scraped_data
+            try:
+                site_value, scraped_data = future.result()
+                if site_value and scraped_data:
+                    site_to_jobs_dict[site_value] = scraped_data
+            except Exception as e:
+                logger.error(f"Future Error occurred: {e}")
 
     return merged_jobs
 
diff --git a/src/jobspy/main.py b/src/jobspy/main.py
index 06939e6..ac8218d 100644
--- a/src/jobspy/main.py
+++ b/src/jobspy/main.py
@@ -7,8 +7,8 @@ from jobspy.scrapers.utils import create_logger
 from jobspy.telegram_bot import TelegramBot
 
 logger = create_logger("Main")
-filter_by_title: list[str] = ["test", "qa", "Lead", "Full-Stack", "Full Stack", "Fullstack", "Frontend", "Front-end", "Front End", "DevOps", "Physical", "Staff"
-                              "data", "automation", "BI", "Principal", "Architect", "Android", "Machine Learning", "Student"]
+filter_by_title: list[str] = ["test", "qa", "Lead", "Full-Stack", "Full Stack", "Fullstack", "Frontend", "Front-end", "Front End", "DevOps", "Physical", "Staff",
+                              "automation", "BI", "Principal", "Architect", "Android", "Machine Learning", "Student", "Data Engineer"]
 
 
 def filter_jobs_by_title_name(job: JobPost):
@@ -25,25 +25,23 @@ async def main():
     telegramBot = TelegramBot()
     jobRepository = JobRepository()
     # sites_to_scrap = [Site.LINKEDIN, Site.GLASSDOOR, Site.INDEED, Site.GOOZALI]
-    sites_to_scrap = [Site.GOOZALI]
-    for site in sites_to_scrap:
-        jobs = scrape_jobs(
-            site_name=[site],
-            search_term="software engineer",
-            google_search_term="software engineer jobs near Tel Aviv Israel since yesterday",
-            locations=["Tel Aviv, Israel", "Ramat Gan, Israel",
-                       "Central, Israel", "Rehovot ,Israel"],
-            results_wanted=200,
-            hours_old=200,
-            country_indeed='israel'
-        )
-        logger.info(f"Found {len(jobs)} jobs")
-        jobs = list(filter(filter_jobs_by_title_name, jobs))
-
-        newJobs = jobRepository.insertManyIfNotFound(jobs)
-
-        for newJob in newJobs:
-            await telegramBot.sendJob(newJob)
+    sites_to_scrap = [Site.GLASSDOOR]
+    # sites_to_scrap = [Site.GOOZALI]
+    jobs = scrape_jobs(
+        site_name=sites_to_scrap,
+        search_term="software engineer",
+        google_search_term="software engineer jobs near Tel Aviv Israel since yesterday",
+        locations=["Tel Aviv, Israel", "Ramat Gan, Israel",
+                   "Central, Israel", "Rehovot ,Israel"],
+        results_wanted=200,
+        hours_old=48,
+        country_indeed='israel'
+    )
+    logger.info(f"Found {len(jobs)} jobs")
+    jobs = list(filter(filter_jobs_by_title_name, jobs))
+    newJobs = jobRepository.insertManyIfNotFound(jobs)
+    for newJob in newJobs:
+        await telegramBot.sendJob(newJob)
 
 # Run the async main function
 if __name__ == "__main__":
diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py
index a41a71c..a5686a0 100644
--- a/src/jobspy/scrapers/glassdoor/__init__.py
+++ b/src/jobspy/scrapers/glassdoor/__init__.py
@@ -37,6 +37,7 @@ from ...jobs import (
 
 logger = create_logger("Glassdoor")
 
+
 class GlassdoorScraper(Scraper):
     def __init__(
         self, proxies: list[str] | str | None = None, ca_cert: str | None = None
@@ -62,7 +63,8 @@ class GlassdoorScraper(Scraper):
         :return: JobResponse containing a list of jobs.
         """
         self.scraper_input = scraper_input
-        self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)
+        self.scraper_input.results_wanted = min(
+            900, scraper_input.results_wanted)
         self.base_url = self.scraper_input.country.get_glassdoor_url()
 
         self.session = create_session(
@@ -71,16 +73,17 @@ class GlassdoorScraper(Scraper):
         token = self._get_csrf_token()
         headers["gd-csrf-token"] = token if token else fallback_token
         self.session.headers.update(headers)
-        job_list: list[JobPost] = [];
+        job_list: list[JobPost] = []
         for location in scraper_input.locations:
             glassDoorLocatiions: List[GlassDoorLocationResponse] = self._get_locations(
                 location, scraper_input.is_remote
             )
             for glassDoorLocatiion in glassDoorLocatiions:
                 logger.info(f"Location: {glassDoorLocatiion.longName}")
-                locationType = get_location_type(glassDoorLocatiion);
-                locationId = get_location_id(glassDoorLocatiion);
-                jobs_temp = self.get_jobs(scraper_input,locationId,locationType);
+                locationType = get_location_type(glassDoorLocatiion)
+                locationId = get_location_id(glassDoorLocatiion)
+                jobs_temp = self.get_jobs(
+                    scraper_input, locationId, locationType)
                 if (jobs_temp is not None and len(jobs_temp) > 1):
                     job_list.extend(jobs_temp)
         return JobResponse(jobs=job_list)
@@ -99,7 +102,8 @@ class GlassdoorScraper(Scraper):
         jobs = []
         self.scraper_input = scraper_input
         try:
-            payload = self._add_payload(location_id, location_type, page_num, cursor)
+            payload = self._add_payload(
+                location_id, location_type, page_num, cursor)
             response = self.session.post(
                 f"{self.base_url}/graph",
                 timeout_seconds=15,
@@ -132,7 +136,8 @@ class GlassdoorScraper(Scraper):
                     if job_post:
                         jobs.append(job_post)
                 except Exception as exc:
-                    raise GlassdoorException(f"Glassdoor generated an exception: {exc}")
+                    raise GlassdoorException(
+                        f"Glassdoor generated an exception: {exc}")
 
         return jobs, self.get_cursor_for_page(
             res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
@@ -150,7 +155,8 @@ class GlassdoorScraper(Scraper):
         cursor = None
 
         range_start = 1 + (scraper_input.offset // self.jobs_per_page)
-        tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
+        tot_pages = (scraper_input.results_wanted //
+                     self.jobs_per_page) + 2
         range_end = min(tot_pages, self.max_pages + 1)
         for page in range(range_start, range_end):
             logger.info(f"search page: {page} / {range_end-1}")
@@ -174,7 +180,8 @@ class GlassdoorScraper(Scraper):
         """
         Fetches csrf token needed for API by visiting a generic page
         """
-        res = self.session.get(f"{self.base_url}/Job/computer-science-jobs.htm")
+        res = self.session.get(
+            f"{self.base_url}/Job/computer-science-jobs.htm")
         pattern = r'"token":\s*"([^"]+)"'
         matches = re.findall(pattern, res.text)
         token = None
@@ -234,7 +241,8 @@ class GlassdoorScraper(Scraper):
             compensation=compensation,
             is_remote=is_remote,
             description=description,
-            emails=extract_emails_from_text(description) if description else None,
+            emails=extract_emails_from_text(
+                description) if description else None,
             company_logo=company_logo,
             listing_type=listing_type,
         )
@@ -280,7 +288,8 @@ class GlassdoorScraper(Scraper):
     def _get_location(self, location: str, is_remote: bool) -> (int, str):
         if not location or is_remote:
             return "11047", "STATE"  # remote options
-        url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
+        url = f"{
+            self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
         res = self.session.get(url)
         if res.status_code != 200:
             if res.status_code == 429:
@@ -290,7 +299,8 @@ class GlassdoorScraper(Scraper):
             else:
                 err = f"Glassdoor response status code {res.status_code}"
                 err += f" - {res.text}"
-            logger.error(f"Glassdoor response status code {res.status_code}")
+            logger.error(f"Glassdoor response status code {
+                res.status_code}")
             return None, None
 
         items = res.json()
@@ -304,17 +314,19 @@ class GlassdoorScraper(Scraper):
             location_type = "STATE"
         elif location_type == "N":
             location_type = "COUNTRY"
-        
+
         return int(items[0]["locationId"]), location_type
-        
+
     # Example string 'Tel Aviv, Israel'
-    def get_city_from_location(self, location:str) -> str:
-        return location.split(',')[0].strip() # Replace space with %2 to get "Tel%2Aviv"
+    def get_city_from_location(self, location: str) -> str:
+        # Replace space with %2 to get "Tel%2Aviv"
+        return location.split(',')[0].strip()
 
     def _get_locations(self, location: str, is_remote: bool) -> List[GlassDoorLocationResponse]:
         if not location or is_remote:
             return "11047", "STATE"  # remote options
-        url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
+        url = f"{
+            self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
         res = self.session.get(url)
         if res.status_code != 200:
             if res.status_code == 429:
@@ -324,7 +336,8 @@ class GlassdoorScraper(Scraper):
             else:
                 err = f"Glassdoor response status code {res.status_code}"
                 err += f" - {res.text}"
-            logger.error(f"Glassdoor response status code {res.status_code}")
+            logger.error(f"Glassdoor response status code {
+                res.status_code}")
             return None, None
         formatted_city = self.get_city_from_location(location)
         items: List[GlassDoorLocationResponse] = [
@@ -336,8 +349,8 @@ class GlassdoorScraper(Scraper):
         if not items:
             logger.error(f"location not found in Glassdoor: {location}")
             # raise ValueError(f"Location '{location}' not found on Glassdoor")
-        
-        return items;
+
+        return items
 
     def _add_payload(
         self,
@@ -351,9 +364,11 @@ class GlassdoorScraper(Scraper):
             fromage = max(self.scraper_input.hours_old // 24, 1)
         filter_params = []
         if self.scraper_input.easy_apply:
-            filter_params.append({"filterKey": "applicationType", "values": "1"})
+            filter_params.append(
+                {"filterKey": "applicationType", "values": "1"})
         if fromage:
-            filter_params.append({"filterKey": "fromAge", "values": str(fromage)})
+            filter_params.append(
+                {"filterKey": "fromAge", "values": str(fromage)})
         payload = {
             "operationName": "JobSearchResultsQuery",
             "variables": {
@@ -373,7 +388,8 @@ class GlassdoorScraper(Scraper):
         }
         if self.scraper_input.job_type:
             payload["variables"]["filterParams"].append(
-                {"filterKey": "jobType", "values": self.scraper_input.job_type.value[0]}
+                {"filterKey": "jobType",
+                    "values": self.scraper_input.job_type.value[0]}
             )
         return json.dumps([payload])
 
@@ -416,4 +432,4 @@ class GlassdoorScraper(Scraper):
     def get_cursor_for_page(pagination_cursors, page_num):
         for cursor_data in pagination_cursors:
             if cursor_data["pageNumber"] == page_num:
-                return cursor_data["cursor"]
\ No newline at end of file
+                return cursor_data["cursor"]
diff --git a/src/jobspy/scrapers/goozali/__init__.py b/src/jobspy/scrapers/goozali/__init__.py
index 1cd2224..f3d05ea 100644
--- a/src/jobspy/scrapers/goozali/__init__.py
+++ b/src/jobspy/scrapers/goozali/__init__.py
@@ -71,6 +71,7 @@ class GoozaliScraper(Scraper):
                 return JobResponse(jobs=job_list)
             except Exception as e:
                 logger.error(f"Exception: {str(e)}")
+                return JobResponse(jobs=job_list)
             # model the response with models
             goozali_response = self.mapper.map_response_to_goozali_response(
                 response=response)