diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py
index 0ad21b8..851af24 100644
--- a/src/jobspy/__init__.py
+++ b/src/jobspy/__init__.py
@@ -26,6 +26,7 @@ def scrape_jobs(
     search_term: str | None = None,
     google_search_term: str | None = None,
     location: str | None = None,
+    locations: list[str] | None = None,
     distance: int | None = 50,
     is_remote: bool = False,
     job_type: str | None = None,
@@ -89,6 +90,7 @@ def scrape_jobs(
         search_term=search_term,
         google_search_term=google_search_term,
         location=location,
+        locations=locations,
         distance=distance,
         is_remote=is_remote,
         job_type=job_type,
diff --git a/src/jobspy/scrapers/__init__.py b/src/jobspy/scrapers/__init__.py
index 25c0841..41819f0 100644
--- a/src/jobspy/scrapers/__init__.py
+++ b/src/jobspy/scrapers/__init__.py
@@ -31,6 +31,7 @@ class ScraperInput(BaseModel):
     google_search_term: str | None = None
     location: str | None = None
+    locations: list[str] | None = None
     country: Country | None = Country.USA
     distance: int | None = None
     is_remote: bool = False
 
diff --git a/src/jobspy/scrapers/glassdoor/GlassDoorLocation.py b/src/jobspy/scrapers/glassdoor/GlassDoorLocation.py
new file mode 100644
index 0000000..a61525a
--- /dev/null
+++ b/src/jobspy/scrapers/glassdoor/GlassDoorLocation.py
@@ -0,0 +1,40 @@
+from typing import List
+from dataclasses import dataclass
+
+@dataclass
+class GlassDoorLocationResponse:
+    compoundId: str
+    countryName: str
+    id: str
+    label: str
+    locationId: int
+    locationType: str
+    longName: str
+    realId: int
+
+
+def get_location_type(glassDoorLocationResponse: GlassDoorLocationResponse) -> str:
+    """
+    Map the one-letter Glassdoor locationType code ('C'/'S'/'N') to a readable name. 
+ """ + if glassDoorLocationResponse.locationType == "C": + return "CITY" + elif glassDoorLocationResponse.locationType == "S": + return "STATE" + elif glassDoorLocationResponse.locationType == "N": + return "COUNTRY" + return "UNKNOWN" + +def get_location_id(glassDoorLocationResponse: GlassDoorLocationResponse) -> int: + """ + Private method to map locationType to a human-readable type. + """ + return int(glassDoorLocationResponse.locationId); + +def print_locations(glassDoorLocationResponses: list[GlassDoorLocationResponse]): + """ + Loop over all locations and print the mapped location types. + """ + for location in glassDoorLocationResponses: + location_type = get_location_type(location) + print(f"Location ID: {location.locationId}, Type: {location_type}") diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py index d2de6dc..02ef817 100644 --- a/src/jobspy/scrapers/glassdoor/__init__.py +++ b/src/jobspy/scrapers/glassdoor/__init__.py @@ -7,13 +7,16 @@ This module contains routines to scrape Glassdoor. from __future__ import annotations +from dataclasses import dataclass import re import json import requests -from typing import Optional, Tuple +from typing import List, Optional, Tuple from datetime import datetime, timedelta from concurrent.futures import ThreadPoolExecutor, as_completed +from jobspy.scrapers.glassdoor.GlassDoorLocation import GlassDoorLocationResponse, get_location_id, get_location_type + from .constants import fallback_token, query_template, headers from .. 
import Scraper, ScraperInput, Site
 from ..utils import extract_emails_from_text, create_logger
@@ -34,7 +37,6 @@ from ...jobs import (
 
 logger = create_logger("Glassdoor")
 
-
 class GlassdoorScraper(Scraper):
     def __init__(
         self, proxies: list[str] | str | None = None, ca_cert: str | None = None
@@ -69,32 +71,17 @@ class GlassdoorScraper(Scraper):
         token = self._get_csrf_token()
         headers["gd-csrf-token"] = token if token else fallback_token
         self.session.headers.update(headers)
-
-        location_id, location_type = self._get_location(
-            scraper_input.location, scraper_input.is_remote
-        )
-        if location_type is None:
-            logger.error("Glassdoor: location not parsed")
-            return JobResponse(jobs=[])
-        job_list: list[JobPost] = []
-        cursor = None
-
-        range_start = 1 + (scraper_input.offset // self.jobs_per_page)
-        tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
-        range_end = min(tot_pages, self.max_pages + 1)
-        for page in range(range_start, range_end):
-            logger.info(f"search page: {page} / {range_end-1}")
-            try:
-                jobs, cursor = self._fetch_jobs_page(
-                    scraper_input, location_id, location_type, page, cursor
-                )
-                job_list.extend(jobs)
-                if not jobs or len(job_list) >= scraper_input.results_wanted:
-                    job_list = job_list[: scraper_input.results_wanted]
-                    break
-            except Exception as e:
-                logger.error(f"Glassdoor: {str(e)}")
-                break
+        job_list: list[JobPost] = []
+        for location in (scraper_input.locations or [scraper_input.location]):
+            glassdoor_locations = self._get_locations(
+                location, scraper_input.is_remote
+            )
+            for glassdoor_location in glassdoor_locations:
+                location_type = get_location_type(glassdoor_location)
+                location_id = get_location_id(glassdoor_location)
+                jobs = self.get_jobs(scraper_input, location_id, location_type)
+                if jobs:
+                    job_list.extend(jobs)
         return JobResponse(jobs=job_list)
 
     def _fetch_jobs_page(
@@ -150,6 +137,38 @@ class GlassdoorScraper(Scraper):
             res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
         )
 
+    def get_jobs(self, scraper_input, location_id: int, location_type: str) -> List[JobPost]:
+        """
+        Fetch all result pages for one resolved location and return the jobs as a list.
+        """
+        try:
+            if location_type is None:
+                logger.error("Glassdoor: location not parsed")
+                return []  # keep the declared List[JobPost] contract
+            job_list: list[JobPost] = []
+            cursor = None
+
+            range_start = 1 + (scraper_input.offset // self.jobs_per_page)
+            tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
+            range_end = min(tot_pages, self.max_pages + 1)
+            for page in range(range_start, range_end):
+                logger.info(f"search page: {page} / {range_end-1}")
+                try:
+                    jobs, cursor = self._fetch_jobs_page(
+                        scraper_input, location_id, location_type, page, cursor
+                    )
+                    job_list.extend(jobs)
+                    if not jobs or len(job_list) >= scraper_input.results_wanted:
+                        job_list = job_list[: scraper_input.results_wanted]
+                        break
+                except Exception as e:
+                    logger.error(f"Glassdoor: {str(e)}")
+                    break
+            return job_list
+        except Exception as e:
+            logger.error(f"Glassdoor: failed to fetch jobs: {str(e)}")
+            return []  # Return an empty list in case of failure
+
     def _get_csrf_token(self):
         """
         Fetches csrf token needed for API by visiting a generic page
@@ -274,6 +293,7 @@ class GlassdoorScraper(Scraper):
         items = res.json()
 
         if not items:
+            logger.error(f"location not found in Glassdoor: {location}")
             raise ValueError(f"Location '{location}' not found on Glassdoor")
         location_type = items[0]["locationType"]
         if location_type == "C":
@@ -282,7 +302,40 @@ class GlassdoorScraper(Scraper):
             location_type = "STATE"
         elif location_type == "N":
             location_type = "COUNTRY"
+        return int(items[0]["locationId"]), location_type
+
+    # Example: 'Tel Aviv, Israel' -> 'Tel Aviv'
+    def get_city_from_location(self, location: str) -> str:
+        return location.split(',')[0].strip()  # keep only the city part for label matching
+
+    def _get_locations(self, location: str, is_remote: bool) -> List[GlassDoorLocationResponse]:
+        if not location or is_remote:
+            return 
"11047", "STATE" # remote options + url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}" + res = self.session.get(url) + if res.status_code != 200: + if res.status_code == 429: + err = f"429 Response - Blocked by Glassdoor for too many requests" + logger.error(err) + return None, None + else: + err = f"Glassdoor response status code {res.status_code}" + err += f" - {res.text}" + logger.error(f"Glassdoor response status code {res.status_code}") + return None, None + formatted_city = self.get_city_from_location(location) + items: List[GlassDoorLocationResponse] = [ + GlassDoorLocationResponse(**item) for item in res.json()] + # Filter items based on the processed city name + items = [ + item for item in items if item.label is not None and formatted_city in item.label + ] + if not items: + logger.error(f"location not found in Glassdoor: {location}") + # raise ValueError(f"Location '{location}' not found on Glassdoor") + + return items; def _add_payload( self, @@ -361,4 +414,4 @@ class GlassdoorScraper(Scraper): def get_cursor_for_page(pagination_cursors, page_num): for cursor_data in pagination_cursors: if cursor_data["pageNumber"] == page_num: - return cursor_data["cursor"] + return cursor_data["cursor"] \ No newline at end of file diff --git a/src/main.py b/src/main.py index f259ed0..469829f 100644 --- a/src/main.py +++ b/src/main.py @@ -7,9 +7,10 @@ jobs = scrape_jobs( site_name=["glassdoor"], search_term="software engineer", google_search_term="software engineer jobs near Tel Aviv Israel since yesterday", - location="Tel Aviv, israel", - results_wanted=20, - hours_old=72, + location="Central, Israel", + locations=["Tel Aviv, Israel","Ramat Gan, Israel","Central, Israel","Rehovot ,Israel"], + results_wanted=200, + hours_old=200, country_indeed='israel', # linkedin_fetch_description=True # gets more info such as description, direct job url (slower)