mirror of https://github.com/Bunsly/JobSpy
added options to add more cities
parent
7c002ae76e
commit
d388211a92
|
@ -26,6 +26,7 @@ def scrape_jobs(
|
|||
search_term: str | None = None,
|
||||
google_search_term: str | None = None,
|
||||
location: str | None = None,
|
||||
locations: list[str] | None = None,
|
||||
distance: int | None = 50,
|
||||
is_remote: bool = False,
|
||||
job_type: str | None = None,
|
||||
|
@ -89,6 +90,7 @@ def scrape_jobs(
|
|||
search_term=search_term,
|
||||
google_search_term=google_search_term,
|
||||
location=location,
|
||||
locations=locations,
|
||||
distance=distance,
|
||||
is_remote=is_remote,
|
||||
job_type=job_type,
|
||||
|
|
|
@ -31,6 +31,7 @@ class ScraperInput(BaseModel):
|
|||
google_search_term: str | None = None
|
||||
|
||||
location: str | None = None
|
||||
locations: list[str] | None = None
|
||||
country: Country | None = Country.USA
|
||||
distance: int | None = None
|
||||
is_remote: bool = False
|
||||
|
|
|
@ -0,0 +1,40 @@
|
|||
from typing import List
|
||||
from dataclasses import dataclass
|
||||
|
||||
@dataclass
class GlassDoorLocationResponse:
    """A single location suggestion from Glassdoor's location lookup.

    The attribute names are camelCase on purpose: instances are built by
    keyword from decoded JSON objects, so the names have to match the keys
    of the payload exactly. The field order is part of the positional
    constructor signature and must not be rearranged.
    """

    compoundId: str
    countryName: str
    id: str
    # Display text of the suggestion.
    label: str
    # Numeric location identifier.
    locationId: int
    # Single-letter kind code; "C", "S" and "N" are the values that
    # get_location_type() maps to CITY, STATE and COUNTRY.
    locationType: str
    longName: str
    realId: int
|
||||
|
||||
|
||||
def get_location_type(glassDoorLocationResponse: GlassDoorLocationResponse) -> str:
    """Translate a location's single-letter type code into its long name.

    Returns "CITY" for "C", "STATE" for "S" and "COUNTRY" for "N"; any
    other code yields "UNKNOWN".
    """
    code = glassDoorLocationResponse.locationType
    for letter, long_name in (("C", "CITY"), ("S", "STATE"), ("N", "COUNTRY")):
        if code == letter:
            return long_name
    return "UNKNOWN"
|
||||
|
||||
def get_location_id(glassDoorLocationResponse: GlassDoorLocationResponse) -> int:
    """Return the location's ``locationId`` as an integer.

    The value is passed through ``int()`` so that an id delivered as a
    numeric string is normalised as well.

    Raises:
        ValueError / TypeError: if ``locationId`` cannot be converted.
    """
    return int(glassDoorLocationResponse.locationId)
|
||||
|
||||
def print_locations(glassDoorLocationResponses: list[GlassDoorLocationResponse]):
    """Print the id and readable type of every location in the list.

    One line per entry is written to standard output; nothing is returned.
    """
    # Lazily pair each entry with its mapped type so mapping and printing
    # still happen entry by entry.
    typed = (
        (location, get_location_type(location))
        for location in glassDoorLocationResponses
    )
    for location, location_type in typed:
        print(f"Location ID: {location.locationId}, Type: {location_type}")
|
|
@ -7,13 +7,16 @@ This module contains routines to scrape Glassdoor.
|
|||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
import re
|
||||
import json
|
||||
import requests
|
||||
from typing import Optional, Tuple
|
||||
from typing import List, Optional, Tuple
|
||||
from datetime import datetime, timedelta
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
from jobspy.scrapers.glassdoor.GlassDoorLocation import GlassDoorLocationResponse, get_location_id, get_location_type
|
||||
|
||||
from .constants import fallback_token, query_template, headers
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
from ..utils import extract_emails_from_text, create_logger
|
||||
|
@ -34,7 +37,6 @@ from ...jobs import (
|
|||
|
||||
logger = create_logger("Glassdoor")
|
||||
|
||||
|
||||
class GlassdoorScraper(Scraper):
|
||||
def __init__(
|
||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
||||
|
@ -69,32 +71,17 @@ class GlassdoorScraper(Scraper):
|
|||
token = self._get_csrf_token()
|
||||
headers["gd-csrf-token"] = token if token else fallback_token
|
||||
self.session.headers.update(headers)
|
||||
|
||||
location_id, location_type = self._get_location(
|
||||
scraper_input.location, scraper_input.is_remote
|
||||
)
|
||||
if location_type is None:
|
||||
logger.error("Glassdoor: location not parsed")
|
||||
return JobResponse(jobs=[])
|
||||
job_list: list[JobPost] = []
|
||||
cursor = None
|
||||
|
||||
range_start = 1 + (scraper_input.offset // self.jobs_per_page)
|
||||
tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
|
||||
range_end = min(tot_pages, self.max_pages + 1)
|
||||
for page in range(range_start, range_end):
|
||||
logger.info(f"search page: {page} / {range_end-1}")
|
||||
try:
|
||||
jobs, cursor = self._fetch_jobs_page(
|
||||
scraper_input, location_id, location_type, page, cursor
|
||||
)
|
||||
job_list.extend(jobs)
|
||||
if not jobs or len(job_list) >= scraper_input.results_wanted:
|
||||
job_list = job_list[: scraper_input.results_wanted]
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"Glassdoor: {str(e)}")
|
||||
break
|
||||
job_list: list[JobPost] = [];
|
||||
for location in scraper_input.locations:
|
||||
glassDoorLocatiions = self._get_locations(
|
||||
location, scraper_input.is_remote
|
||||
)
|
||||
for glassDoorLocatiion in glassDoorLocatiions:
|
||||
locationType = get_location_type(glassDoorLocatiion);
|
||||
locationId = get_location_id(glassDoorLocatiion);
|
||||
jobs_temp = self.get_jobs(scraper_input,locationId,locationType);
|
||||
if (jobs_temp is not None and len(jobs_temp) > 1):
|
||||
job_list.extend(jobs_temp)
|
||||
return JobResponse(jobs=job_list)
|
||||
|
||||
def _fetch_jobs_page(
|
||||
|
@ -150,6 +137,38 @@ class GlassdoorScraper(Scraper):
|
|||
res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
|
||||
)
|
||||
|
||||
def get_jobs(self, scraper_input, location_id: int, location_type: str) -> List[JobPost]:
    """Collect job posts for one location by walking the result pages.

    Pages are requested through ``self._fetch_jobs_page`` starting at the
    page implied by ``scraper_input.offset`` and stopping at whichever
    comes first: ``self.max_pages``, an empty page, a failed page, or
    ``scraper_input.results_wanted`` collected posts.

    Args:
        scraper_input: search parameters; ``offset`` and ``results_wanted``
            are read here, the object is forwarded to ``_fetch_jobs_page``.
        location_id: Glassdoor location id to search in.
        location_type: location kind for that id (e.g. "CITY", "STATE").

    Returns:
        A list of at most ``results_wanted`` posts. The list is empty when
        ``location_type`` is None, and partial when a page request fails.
    """
    if location_type is None:
        logger.error("Glassdoor: location not parsed")
        # Always hand back a list: callers take len() of the result and
        # extend their own list with it.
        return []

    job_list: list[JobPost] = []
    cursor = None

    range_start = 1 + (scraper_input.offset // self.jobs_per_page)
    tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
    range_end = min(tot_pages, self.max_pages + 1)

    for page in range(range_start, range_end):
        logger.info(f"search page: {page} / {range_end-1}")
        try:
            jobs, cursor = self._fetch_jobs_page(
                scraper_input, location_id, location_type, page, cursor
            )
            job_list.extend(jobs)
        except Exception as e:
            # Best effort: keep what was collected so far and stop paging.
            logger.error(f"Glassdoor: {str(e)}")
            break
        if not jobs or len(job_list) >= scraper_input.results_wanted:
            break

    # Trimming is a no-op unless the last page overshot the requested count.
    return job_list[: scraper_input.results_wanted]
|
||||
|
||||
def _get_csrf_token(self):
|
||||
"""
|
||||
Fetches csrf token needed for API by visiting a generic page
|
||||
|
@ -274,6 +293,7 @@ class GlassdoorScraper(Scraper):
|
|||
items = res.json()
|
||||
|
||||
if not items:
|
||||
logger.error(f"location not found in Glassdoor: {location}")
|
||||
raise ValueError(f"Location '{location}' not found on Glassdoor")
|
||||
location_type = items[0]["locationType"]
|
||||
if location_type == "C":
|
||||
|
@ -282,7 +302,40 @@ class GlassdoorScraper(Scraper):
|
|||
location_type = "STATE"
|
||||
elif location_type == "N":
|
||||
location_type = "COUNTRY"
|
||||
|
||||
return int(items[0]["locationId"]), location_type
|
||||
|
||||
# Example string 'Tel Aviv, Israel'
|
||||
def get_city_from_location(self, location: str) -> str:
    """Return the part of *location* before its first comma, trimmed.

    For example ``'Tel Aviv, Israel'`` yields ``'Tel Aviv'``. The text is
    returned as-is apart from the trimming; no URL encoding is applied.
    """
    city, _, _ = location.partition(",")
    return city.strip()
|
||||
|
||||
def _get_locations(self, location: str, is_remote: bool) -> List[GlassDoorLocationResponse]:
    """Look up the Glassdoor locations that match a free-text location.

    Args:
        location: text such as ``'Tel Aviv, Israel'``. The whole text is
            sent as the search term; only the part before the first comma
            is used to filter the suggestions by label.
        is_remote: when true (or when *location* is empty) no lookup is
            made and a single entry for the remote location is returned.

    Returns:
        A list of matching locations. The list is empty when the request
        fails or nothing matches, so callers can always iterate the result.
    """
    if not location or is_remote:
        # Remote option: id 11047 with state-level type ("S" maps to
        # "STATE"). Only locationId and locationType are consumed by
        # get_location_id / get_location_type; the remaining fields are
        # placeholders needed to build the dataclass.
        return [
            GlassDoorLocationResponse(
                compoundId="",
                countryName="",
                id="11047",
                label="Remote",
                locationId=11047,
                locationType="S",
                longName="Remote",
                realId=11047,
            )
        ]

    url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
    res = self.session.get(url)
    if res.status_code != 200:
        if res.status_code == 429:
            logger.error("429 Response - Blocked by Glassdoor for too many requests")
        else:
            logger.error(f"Glassdoor response status code {res.status_code}")
        return []

    formatted_city = self.get_city_from_location(location)
    # NOTE(review): assumes every JSON item has exactly the dataclass's
    # keys; an extra or missing key raises TypeError here - confirm
    # against the live response.
    candidates: List[GlassDoorLocationResponse] = [
        GlassDoorLocationResponse(**item) for item in res.json()
    ]
    # Keep only the suggestions whose label mentions the requested city.
    matches = [
        candidate
        for candidate in candidates
        if candidate.label is not None and formatted_city in candidate.label
    ]
    if not matches:
        # An unknown location is logged rather than raised so the caller
        # can carry on with its other locations.
        logger.error(f"location not found in Glassdoor: {location}")
    return matches
|
||||
|
||||
def _add_payload(
|
||||
self,
|
||||
|
@ -361,4 +414,4 @@ class GlassdoorScraper(Scraper):
|
|||
def get_cursor_for_page(pagination_cursors, page_num):
|
||||
for cursor_data in pagination_cursors:
|
||||
if cursor_data["pageNumber"] == page_num:
|
||||
return cursor_data["cursor"]
|
||||
return cursor_data["cursor"]
|
|
@ -7,9 +7,10 @@ jobs = scrape_jobs(
|
|||
site_name=["glassdoor"],
|
||||
search_term="software engineer",
|
||||
google_search_term="software engineer jobs near Tel Aviv Israel since yesterday",
|
||||
location="Tel Aviv, israel",
|
||||
results_wanted=20,
|
||||
hours_old=72,
|
||||
location="Central, Israel",
|
||||
locations=["Tel Aviv, Israel","Ramat Gan, Israel","Central, Israel","Rehovot ,Israel"],
|
||||
results_wanted=200,
|
||||
hours_old=200,
|
||||
country_indeed='israel',
|
||||
|
||||
# linkedin_fetch_description=True # gets more info such as description, direct job url (slower)
|
||||
|
|
Loading…
Reference in New Issue