added option to search across multiple cities

pull/231/head
Yariv Menachem 2024-12-10 15:34:43 +02:00
parent 7c002ae76e
commit d388211a92
5 changed files with 129 additions and 32 deletions

View File

@@ -26,6 +26,7 @@ def scrape_jobs(
    search_term: str | None = None,
    google_search_term: str | None = None,
    location: str | None = None,
    locations: list[str] | None = None,
    distance: int | None = 50,
    is_remote: bool = False,
    job_type: str | None = None,
@@ -89,6 +90,7 @@ def scrape_jobs(
        search_term=search_term,
        google_search_term=google_search_term,
        location=location,
        locations=locations,
        distance=distance,
        is_remote=is_remote,
        job_type=job_type,
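
For context, a minimal sketch of calling the extended entry point with the new `locations` parameter (values are hypothetical; the import path is assumed from the jobspy package layout shown in this PR):

from jobspy import scrape_jobs

jobs = scrape_jobs(
    site_name=["glassdoor"],
    search_term="software engineer",
    locations=["Tel Aviv, Israel", "Ramat Gan, Israel"],  # new multi-city parameter
    results_wanted=50,
)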

View File

@@ -31,6 +31,7 @@ class ScraperInput(BaseModel):
    google_search_term: str | None = None
    location: str | None = None
    locations: list[str] | None = None
    country: Country | None = Country.USA
    distance: int | None = None
    is_remote: bool = False
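
A hypothetical illustration of the extended model: the legacy `location` and the new `locations` list can coexist, with the list taking precedence in the Glassdoor scraper below. The import path and the `site_type` field are assumptions based on the scraper package imports shown later in this diff:

from jobspy.scrapers import ScraperInput, Site  # import path assumed

scraper_input = ScraperInput(
    site_type=[Site.GLASSDOOR],   # assumed required field
    search_term="software engineer",
    location="Tel Aviv, Israel",  # legacy single location
    locations=["Tel Aviv, Israel", "Ramat Gan, Israel"],  # new list field
)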

View File

@@ -0,0 +1,40 @@
from dataclasses import dataclass


@dataclass
class GlassDoorLocationResponse:
    compoundId: str
    countryName: str
    id: str
    label: str
    locationId: int
    locationType: str
    longName: str
    realId: int


def get_location_type(glassdoor_location: GlassDoorLocationResponse) -> str:
    """
    Map a Glassdoor locationType code to a human-readable type.
    """
    if glassdoor_location.locationType == "C":
        return "CITY"
    elif glassdoor_location.locationType == "S":
        return "STATE"
    elif glassdoor_location.locationType == "N":
        return "COUNTRY"
    return "UNKNOWN"


def get_location_id(glassdoor_location: GlassDoorLocationResponse) -> int:
    """
    Return the numeric location id of a Glassdoor location response.
    """
    return int(glassdoor_location.locationId)


def print_locations(glassdoor_locations: list[GlassDoorLocationResponse]) -> None:
    """
    Loop over all locations and print the mapped location types.
    """
    for location in glassdoor_locations:
        location_type = get_location_type(location)
        print(f"Location ID: {location.locationId}, Type: {location_type}")

View File

@@ -7,13 +7,16 @@ This module contains routines to scrape Glassdoor.
from __future__ import annotations
from dataclasses import dataclass
import re
import json
import requests
from typing import Optional, Tuple
from typing import List, Optional, Tuple
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
from jobspy.scrapers.glassdoor.GlassDoorLocation import GlassDoorLocationResponse, get_location_id, get_location_type
from .constants import fallback_token, query_template, headers
from .. import Scraper, ScraperInput, Site
from ..utils import extract_emails_from_text, create_logger
@@ -34,7 +37,6 @@ from ...jobs import (
logger = create_logger("Glassdoor")
class GlassdoorScraper(Scraper):
    def __init__(
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
@@ -69,32 +71,17 @@ class GlassdoorScraper(Scraper):
        token = self._get_csrf_token()
        headers["gd-csrf-token"] = token if token else fallback_token
        self.session.headers.update(headers)
        location_id, location_type = self._get_location(
            scraper_input.location, scraper_input.is_remote
        )
        if location_type is None:
            logger.error("Glassdoor: location not parsed")
            return JobResponse(jobs=[])
        job_list: list[JobPost] = []
        cursor = None
        range_start = 1 + (scraper_input.offset // self.jobs_per_page)
        tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
        range_end = min(tot_pages, self.max_pages + 1)
        for page in range(range_start, range_end):
            logger.info(f"search page: {page} / {range_end - 1}")
            try:
                jobs, cursor = self._fetch_jobs_page(
                    scraper_input, location_id, location_type, page, cursor
                )
                job_list.extend(jobs)
                if not jobs or len(job_list) >= scraper_input.results_wanted:
                    job_list = job_list[: scraper_input.results_wanted]
                    break
            except Exception as e:
                logger.error(f"Glassdoor: {str(e)}")
                break
        job_list: list[JobPost] = []
        # Fall back to the legacy single `location` when no list is given
        locations = scraper_input.locations or [scraper_input.location]
        for location in locations:
            # Each requested city can resolve to several Glassdoor locations
            glassdoor_locations = self._get_locations(
                location, scraper_input.is_remote
            )
            for glassdoor_location in glassdoor_locations:
                location_type = get_location_type(glassdoor_location)
                location_id = get_location_id(glassdoor_location)
                jobs_temp = self.get_jobs(scraper_input, location_id, location_type)
                if jobs_temp:
                    job_list.extend(jobs_temp)
        return JobResponse(jobs=job_list)

    def _fetch_jobs_page(
@@ -150,6 +137,38 @@ class GlassdoorScraper(Scraper):
            res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
        )
    def get_jobs(
        self, scraper_input, location_id: int, location_type: str
    ) -> List[JobPost]:
        """
        Fetch jobs for one resolved Glassdoor location, paging through results
        until results_wanted is reached, and return them as a list.
        """
        if location_type is None:
            logger.error("Glassdoor: location not parsed")
            return []
        job_list: list[JobPost] = []
        cursor = None
        range_start = 1 + (scraper_input.offset // self.jobs_per_page)
        tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
        range_end = min(tot_pages, self.max_pages + 1)
        for page in range(range_start, range_end):
            logger.info(f"search page: {page} / {range_end - 1}")
            try:
                jobs, cursor = self._fetch_jobs_page(
                    scraper_input, location_id, location_type, page, cursor
                )
                job_list.extend(jobs)
                if not jobs or len(job_list) >= scraper_input.results_wanted:
                    job_list = job_list[: scraper_input.results_wanted]
                    break
            except Exception as e:
                logger.error(f"Glassdoor: {str(e)}")
                break
        return job_list
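    # Worked example of the paging window above (hypothetical numbers):
    # with offset=30, jobs_per_page=30, results_wanted=100, max_pages=30,
    # range_start = 1 + 30 // 30 = 2 and tot_pages = 100 // 30 + 2 = 5,
    # so pages 2-4 are fetched and job_list is trimmed to results_wanted.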
    def _get_csrf_token(self):
        """
        Fetches csrf token needed for API by visiting a generic page
@@ -274,6 +293,7 @@ class GlassdoorScraper(Scraper):
        items = res.json()
        if not items:
            logger.error(f"location not found in Glassdoor: {location}")
            raise ValueError(f"Location '{location}' not found on Glassdoor")
        location_type = items[0]["locationType"]
        if location_type == "C":
@@ -282,7 +302,40 @@ class GlassdoorScraper(Scraper):
            location_type = "STATE"
        elif location_type == "N":
            location_type = "COUNTRY"
        return int(items[0]["locationId"]), location_type
    # Example: "Tel Aviv, Israel" -> "Tel Aviv"
    def get_city_from_location(self, location: str) -> str:
        return location.split(",")[0].strip()
    def _get_locations(self, location: str, is_remote: bool) -> List[GlassDoorLocationResponse]:
        if not location or is_remote:
            # Fall back to the generic remote location (id 11047, state level);
            # the remaining fields are placeholders.
            return [GlassDoorLocationResponse(
                compoundId="", countryName="", id="11047", label="Remote",
                locationId=11047, locationType="S", longName="Remote", realId=11047,
            )]
        url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
        res = self.session.get(url)
        if res.status_code != 200:
            if res.status_code == 429:
                logger.error("429 Response - Blocked by Glassdoor for too many requests")
            else:
                logger.error(f"Glassdoor response status code {res.status_code} - {res.text}")
            return []
        formatted_city = self.get_city_from_location(location)
        items: List[GlassDoorLocationResponse] = [
            GlassDoorLocationResponse(**item) for item in res.json()
        ]
        # Keep only suggestions whose label contains the requested city name
        items = [
            item for item in items
            if item.label is not None and formatted_city in item.label
        ]
        if not items:
            logger.error(f"location not found in Glassdoor: {location}")
        return items
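    # For reference, findPopularLocationAjax is assumed to return a JSON array
    # whose objects mirror GlassDoorLocationResponse, e.g. (values made up):
    # [{"compoundId": "C2421", "countryName": "Israel", "id": "2421",
    #   "label": "Tel Aviv, Israel", "locationId": 2421, "locationType": "C",
    #   "longName": "Tel Aviv, Israel", "realId": 2421}]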
    def _add_payload(
        self,
@@ -361,4 +414,4 @@ class GlassdoorScraper(Scraper):
    def get_cursor_for_page(pagination_cursors, page_num):
        for cursor_data in pagination_cursors:
            if cursor_data["pageNumber"] == page_num:
                return cursor_data["cursor"]

View File

@@ -7,9 +7,10 @@ jobs = scrape_jobs(
    site_name=["glassdoor"],
    search_term="software engineer",
    google_search_term="software engineer jobs near Tel Aviv Israel since yesterday",
    location="Tel Aviv, israel",
    results_wanted=20,
    hours_old=72,
    location="Central, Israel",
    locations=["Tel Aviv, Israel", "Ramat Gan, Israel", "Central, Israel", "Rehovot, Israel"],
    results_wanted=200,
    hours_old=200,
    country_indeed='israel',
    # linkedin_fetch_description=True # gets more info such as description, direct job url (slower)
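
# A possible continuation of this example, after the call above completes
# (assumption: scrape_jobs returns a pandas DataFrame, as in upstream JobSpy):
print(f"Found {len(jobs)} jobs")
jobs.to_csv("glassdoor_jobs.csv", index=False)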