Added an option to search across multiple cities

pull/231/head
Yariv Menachem 2024-12-10 15:34:43 +02:00
parent 7c002ae76e
commit d388211a92
5 changed files with 129 additions and 32 deletions

View File

@ -26,6 +26,7 @@ def scrape_jobs(
search_term: str | None = None, search_term: str | None = None,
google_search_term: str | None = None, google_search_term: str | None = None,
location: str | None = None, location: str | None = None,
locations: list[str] | None = None,
distance: int | None = 50, distance: int | None = 50,
is_remote: bool = False, is_remote: bool = False,
job_type: str | None = None, job_type: str | None = None,
@ -89,6 +90,7 @@ def scrape_jobs(
search_term=search_term, search_term=search_term,
google_search_term=google_search_term, google_search_term=google_search_term,
location=location, location=location,
locations=locations,
distance=distance, distance=distance,
is_remote=is_remote, is_remote=is_remote,
job_type=job_type, job_type=job_type,

View File

@ -31,6 +31,7 @@ class ScraperInput(BaseModel):
google_search_term: str | None = None google_search_term: str | None = None
location: str | None = None location: str | None = None
locations: list[str] | None = None
country: Country | None = Country.USA country: Country | None = Country.USA
distance: int | None = None distance: int | None = None
is_remote: bool = False is_remote: bool = False

View File

@ -0,0 +1,40 @@
from typing import List
from dataclasses import dataclass
@dataclass
class GlassDoorLocationResponse:
    # One location record as returned by Glassdoor's
    # findPopularLocationAjax.htm endpoint; field names mirror the JSON keys
    # exactly because instances are built with GlassDoorLocationResponse(**item).
    compoundId: str
    countryName: str
    id: str
    label: str          # human-readable name, e.g. "Tel Aviv, Israel"
    locationId: int     # numeric id used by the jobs query payload (may arrive as str — see get_location_id)
    locationType: str   # single-letter code: "C" city, "S" state, "N" country
    longName: str
    realId: int
def get_location_type(glassDoorLocationResponse: "GlassDoorLocationResponse") -> str:
    """Map Glassdoor's single-letter ``locationType`` code to a readable name.

    Args:
        glassDoorLocationResponse: a location record whose ``locationType``
            is one of ``"C"`` (city), ``"S"`` (state) or ``"N"`` (country).

    Returns:
        ``"CITY"``, ``"STATE"``, ``"COUNTRY"``, or ``"UNKNOWN"`` for any
        unrecognized code.
    """
    # Table-driven mapping; unknown codes fall through to "UNKNOWN".
    code_to_name = {"C": "CITY", "S": "STATE", "N": "COUNTRY"}
    return code_to_name.get(glassDoorLocationResponse.locationType, "UNKNOWN")
def get_location_id(glassDoorLocationResponse: "GlassDoorLocationResponse") -> int:
    """Return the numeric Glassdoor location id for a location record.

    The API may deliver ``locationId`` as a string, so it is coerced to
    ``int`` explicitly before being used in the jobs query payload.
    """
    return int(glassDoorLocationResponse.locationId)
def print_locations(glassDoorLocationResponses: list[GlassDoorLocationResponse]):
    """
    Loop over all locations and print the mapped location types.
    """
    for loc in glassDoorLocationResponses:
        print(f"Location ID: {loc.locationId}, Type: {get_location_type(loc)}")

View File

@ -7,13 +7,16 @@ This module contains routines to scrape Glassdoor.
from __future__ import annotations from __future__ import annotations
from dataclasses import dataclass
import re import re
import json import json
import requests import requests
from typing import Optional, Tuple from typing import List, Optional, Tuple
from datetime import datetime, timedelta from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from jobspy.scrapers.glassdoor.GlassDoorLocation import GlassDoorLocationResponse, get_location_id, get_location_type
from .constants import fallback_token, query_template, headers from .constants import fallback_token, query_template, headers
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
from ..utils import extract_emails_from_text, create_logger from ..utils import extract_emails_from_text, create_logger
@ -34,7 +37,6 @@ from ...jobs import (
logger = create_logger("Glassdoor") logger = create_logger("Glassdoor")
class GlassdoorScraper(Scraper): class GlassdoorScraper(Scraper):
def __init__( def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None self, proxies: list[str] | str | None = None, ca_cert: str | None = None
@ -69,32 +71,17 @@ class GlassdoorScraper(Scraper):
token = self._get_csrf_token() token = self._get_csrf_token()
headers["gd-csrf-token"] = token if token else fallback_token headers["gd-csrf-token"] = token if token else fallback_token
self.session.headers.update(headers) self.session.headers.update(headers)
job_list: list[JobPost] = [];
location_id, location_type = self._get_location( for location in scraper_input.locations:
scraper_input.location, scraper_input.is_remote glassDoorLocatiions = self._get_locations(
location, scraper_input.is_remote
) )
if location_type is None: for glassDoorLocatiion in glassDoorLocatiions:
logger.error("Glassdoor: location not parsed") locationType = get_location_type(glassDoorLocatiion);
return JobResponse(jobs=[]) locationId = get_location_id(glassDoorLocatiion);
job_list: list[JobPost] = [] jobs_temp = self.get_jobs(scraper_input,locationId,locationType);
cursor = None if (jobs_temp is not None and len(jobs_temp) > 1):
job_list.extend(jobs_temp)
range_start = 1 + (scraper_input.offset // self.jobs_per_page)
tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
range_end = min(tot_pages, self.max_pages + 1)
for page in range(range_start, range_end):
logger.info(f"search page: {page} / {range_end-1}")
try:
jobs, cursor = self._fetch_jobs_page(
scraper_input, location_id, location_type, page, cursor
)
job_list.extend(jobs)
if not jobs or len(job_list) >= scraper_input.results_wanted:
job_list = job_list[: scraper_input.results_wanted]
break
except Exception as e:
logger.error(f"Glassdoor: {str(e)}")
break
return JobResponse(jobs=job_list) return JobResponse(jobs=job_list)
def _fetch_jobs_page( def _fetch_jobs_page(
@ -150,6 +137,38 @@ class GlassdoorScraper(Scraper):
res_json["data"]["jobListings"]["paginationCursors"], page_num + 1 res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
) )
def get_jobs(self, scraper_input, location_id: int, location_type: str) -> List[JobPost]:
    """
    Fetch jobs for one resolved location, paging until ``results_wanted``
    is reached, a page comes back empty, or ``max_pages`` is hit.

    Args:
        scraper_input: the search parameters (offset, results_wanted, ...).
        location_id: numeric Glassdoor location id.
        location_type: "CITY" / "STATE" / "COUNTRY"; ``None`` means the
            location could not be parsed.

    Returns:
        A list of JobPost objects — always a list (empty on failure), so
        callers can safely ``extend()`` with the result.
    """
    if location_type is None:
        logger.error("Glassdoor: location not parsed")
        # Was JobResponse(jobs=[]): callers extend() this return value,
        # so the failure value must be a plain list.
        return []
    job_list: list[JobPost] = []
    try:
        cursor = None
        range_start = 1 + (scraper_input.offset // self.jobs_per_page)
        tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
        range_end = min(tot_pages, self.max_pages + 1)
        for page in range(range_start, range_end):
            logger.info(f"search page: {page} / {range_end - 1}")
            try:
                jobs, cursor = self._fetch_jobs_page(
                    scraper_input, location_id, location_type, page, cursor
                )
                job_list.extend(jobs)
                if not jobs or len(job_list) >= scraper_input.results_wanted:
                    job_list = job_list[: scraper_input.results_wanted]
                    break
            except Exception as e:
                logger.error(f"Glassdoor: {str(e)}")
                break
        return job_list
    except Exception as e:
        # NOTE: the previous message interpolated `page`, which is unbound
        # when the failure happens before the loop starts (NameError inside
        # the handler). Log without it.
        logger.error(f"Glassdoor: failed to fetch jobs for location {location_id}: {str(e)}")
        return []  # return an empty list in case of failure
def _get_csrf_token(self): def _get_csrf_token(self):
""" """
Fetches csrf token needed for API by visiting a generic page Fetches csrf token needed for API by visiting a generic page
@ -274,6 +293,7 @@ class GlassdoorScraper(Scraper):
items = res.json() items = res.json()
if not items: if not items:
logger.error(f"location not found in Glassdoor: {location}")
raise ValueError(f"Location '{location}' not found on Glassdoor") raise ValueError(f"Location '{location}' not found on Glassdoor")
location_type = items[0]["locationType"] location_type = items[0]["locationType"]
if location_type == "C": if location_type == "C":
@ -282,8 +302,41 @@ class GlassdoorScraper(Scraper):
location_type = "STATE" location_type = "STATE"
elif location_type == "N": elif location_type == "N":
location_type = "COUNTRY" location_type = "COUNTRY"
return int(items[0]["locationId"]), location_type return int(items[0]["locationId"]), location_type
# Example: 'Tel Aviv, Israel' -> 'Tel Aviv'
def get_city_from_location(self, location: str) -> str:
    """Return the city portion of a 'City, Country' location string.

    Takes the text before the first comma and strips surrounding
    whitespace; a string with no comma is returned stripped as-is.
    """
    return location.split(',')[0].strip()
def _get_locations(self, location: str, is_remote: bool) -> List[GlassDoorLocationResponse]:
    """
    Resolve a free-text location into Glassdoor location records.

    Queries findPopularLocationAjax.htm and keeps only the results whose
    label contains the requested city name.

    Args:
        location: free-text location, e.g. "Tel Aviv, Israel".
        is_remote: when True (or location is empty), return the synthetic
            "remote" pseudo-state record (Glassdoor id 11047).

    Returns:
        A list of GlassDoorLocationResponse — always a list (empty on
        lookup failure or HTTP error), so callers can iterate safely.
    """
    if not location or is_remote:
        # Remote search. The previous code returned the tuple
        # ("11047", "STATE"), which crashed callers that iterate location
        # records; wrap the same id/type in a proper record instead.
        return [
            GlassDoorLocationResponse(
                compoundId="",
                countryName="",
                id="11047",
                label="Remote",
                locationId=11047,
                locationType="S",  # "S" maps to "STATE" in get_location_type
                longName="Remote",
                realId=11047,
            )
        ]
    url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
    res = self.session.get(url)
    if res.status_code != 200:
        if res.status_code == 429:
            logger.error(f"429 Response - Blocked by Glassdoor for too many requests")
        else:
            logger.error(f"Glassdoor response status code {res.status_code}")
        # Was `return None, None`: callers iterate the result, so failures
        # must yield an empty list, not a tuple.
        return []
    formatted_city = self.get_city_from_location(location)
    items: List[GlassDoorLocationResponse] = [
        GlassDoorLocationResponse(**item) for item in res.json()
    ]
    # Filter items based on the processed city name.
    items = [
        item for item in items if item.label is not None and formatted_city in item.label
    ]
    if not items:
        logger.error(f"location not found in Glassdoor: {location}")
    return items
def _add_payload( def _add_payload(
self, self,
location_id: int, location_id: int,

View File

@ -7,9 +7,10 @@ jobs = scrape_jobs(
site_name=["glassdoor"], site_name=["glassdoor"],
search_term="software engineer", search_term="software engineer",
google_search_term="software engineer jobs near Tel Aviv Israel since yesterday", google_search_term="software engineer jobs near Tel Aviv Israel since yesterday",
location="Tel Aviv, israel", location="Central, Israel",
results_wanted=20, locations=["Tel Aviv, Israel","Ramat Gan, Israel","Central, Israel","Rehovot ,Israel"],
hours_old=72, results_wanted=200,
hours_old=200,
country_indeed='israel', country_indeed='israel',
# linkedin_fetch_description=True # gets more info such as description, direct job url (slower) # linkedin_fetch_description=True # gets more info such as description, direct job url (slower)