mirror of https://github.com/Bunsly/JobSpy
added options to add more cities
parent
7c002ae76e
commit
d388211a92
|
@ -26,6 +26,7 @@ def scrape_jobs(
|
||||||
search_term: str | None = None,
|
search_term: str | None = None,
|
||||||
google_search_term: str | None = None,
|
google_search_term: str | None = None,
|
||||||
location: str | None = None,
|
location: str | None = None,
|
||||||
|
locations: list[str] | None = None,
|
||||||
distance: int | None = 50,
|
distance: int | None = 50,
|
||||||
is_remote: bool = False,
|
is_remote: bool = False,
|
||||||
job_type: str | None = None,
|
job_type: str | None = None,
|
||||||
|
@ -89,6 +90,7 @@ def scrape_jobs(
|
||||||
search_term=search_term,
|
search_term=search_term,
|
||||||
google_search_term=google_search_term,
|
google_search_term=google_search_term,
|
||||||
location=location,
|
location=location,
|
||||||
|
locations=locations,
|
||||||
distance=distance,
|
distance=distance,
|
||||||
is_remote=is_remote,
|
is_remote=is_remote,
|
||||||
job_type=job_type,
|
job_type=job_type,
|
||||||
|
|
|
@ -31,6 +31,7 @@ class ScraperInput(BaseModel):
|
||||||
google_search_term: str | None = None
|
google_search_term: str | None = None
|
||||||
|
|
||||||
location: str | None = None
|
location: str | None = None
|
||||||
|
locations: list[str] | None = None
|
||||||
country: Country | None = Country.USA
|
country: Country | None = Country.USA
|
||||||
distance: int | None = None
|
distance: int | None = None
|
||||||
is_remote: bool = False
|
is_remote: bool = False
|
||||||
|
|
|
@ -0,0 +1,40 @@
|
||||||
|
from typing import List
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
@dataclass
class GlassDoorLocationResponse:
    """A single location entry decoded from Glassdoor's location lookup.

    The attribute names are camelCase rather than snake_case so that a
    decoded JSON object can be unpacked straight into the constructor
    (``GlassDoorLocationResponse(**item)``); renaming a field would break
    that unpacking.
    """

    # Identification / display fields, kept exactly as delivered.
    compoundId: str
    countryName: str
    id: str
    label: str

    # Numeric id and one-letter type code; the helpers in this module read
    # these two ("C", "S" and "N" are the codes they recognise).
    locationId: int
    locationType: str

    longName: str
    realId: int
|
||||||
|
|
||||||
|
|
||||||
|
def get_location_type(glassDoorLocationResponse: GlassDoorLocationResponse) -> str:
    """
    Translate the one-letter ``locationType`` code of a location entry
    into its readable name.

    "C" -> "CITY", "S" -> "STATE", "N" -> "COUNTRY"; every other value
    yields "UNKNOWN".
    """
    type_code = glassDoorLocationResponse.locationType
    known_codes = (("C", "CITY"), ("S", "STATE"), ("N", "COUNTRY"))
    for code, readable_name in known_codes:
        if type_code == code:
            return readable_name
    return "UNKNOWN"
|
||||||
|
|
||||||
|
def get_location_id(glassDoorLocationResponse: GlassDoorLocationResponse) -> int:
    """
    Return the ``locationId`` of a location entry as an ``int``.

    The value is passed through ``int()`` so that the caller always gets
    an integer, whatever form the attribute was stored in (presumably it
    can arrive as a numeric string from the decoded response - verify
    against the API).

    Raises:
        ValueError / TypeError: if ``locationId`` cannot be converted.
    """
    return int(glassDoorLocationResponse.locationId)
|
||||||
|
|
||||||
|
def print_locations(glassDoorLocationResponses: list[GlassDoorLocationResponse]):
    """
    Print one line per location entry: its ``locationId`` followed by the
    readable type name produced by ``get_location_type``.
    """
    for entry in glassDoorLocationResponses:
        readable_type = get_location_type(entry)
        print(f"Location ID: {entry.locationId}, Type: {readable_type}")
|
|
@ -7,13 +7,16 @@ This module contains routines to scrape Glassdoor.
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
import requests
|
import requests
|
||||||
from typing import Optional, Tuple
|
from typing import List, Optional, Tuple
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
|
from jobspy.scrapers.glassdoor.GlassDoorLocation import GlassDoorLocationResponse, get_location_id, get_location_type
|
||||||
|
|
||||||
from .constants import fallback_token, query_template, headers
|
from .constants import fallback_token, query_template, headers
|
||||||
from .. import Scraper, ScraperInput, Site
|
from .. import Scraper, ScraperInput, Site
|
||||||
from ..utils import extract_emails_from_text, create_logger
|
from ..utils import extract_emails_from_text, create_logger
|
||||||
|
@ -34,7 +37,6 @@ from ...jobs import (
|
||||||
|
|
||||||
logger = create_logger("Glassdoor")
|
logger = create_logger("Glassdoor")
|
||||||
|
|
||||||
|
|
||||||
class GlassdoorScraper(Scraper):
|
class GlassdoorScraper(Scraper):
|
||||||
def __init__(
|
def __init__(
|
||||||
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
|
||||||
|
@ -69,32 +71,17 @@ class GlassdoorScraper(Scraper):
|
||||||
token = self._get_csrf_token()
|
token = self._get_csrf_token()
|
||||||
headers["gd-csrf-token"] = token if token else fallback_token
|
headers["gd-csrf-token"] = token if token else fallback_token
|
||||||
self.session.headers.update(headers)
|
self.session.headers.update(headers)
|
||||||
|
job_list: list[JobPost] = [];
|
||||||
location_id, location_type = self._get_location(
|
for location in scraper_input.locations:
|
||||||
scraper_input.location, scraper_input.is_remote
|
glassDoorLocatiions = self._get_locations(
|
||||||
|
location, scraper_input.is_remote
|
||||||
)
|
)
|
||||||
if location_type is None:
|
for glassDoorLocatiion in glassDoorLocatiions:
|
||||||
logger.error("Glassdoor: location not parsed")
|
locationType = get_location_type(glassDoorLocatiion);
|
||||||
return JobResponse(jobs=[])
|
locationId = get_location_id(glassDoorLocatiion);
|
||||||
job_list: list[JobPost] = []
|
jobs_temp = self.get_jobs(scraper_input,locationId,locationType);
|
||||||
cursor = None
|
if (jobs_temp is not None and len(jobs_temp) > 1):
|
||||||
|
job_list.extend(jobs_temp)
|
||||||
range_start = 1 + (scraper_input.offset // self.jobs_per_page)
|
|
||||||
tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
|
|
||||||
range_end = min(tot_pages, self.max_pages + 1)
|
|
||||||
for page in range(range_start, range_end):
|
|
||||||
logger.info(f"search page: {page} / {range_end-1}")
|
|
||||||
try:
|
|
||||||
jobs, cursor = self._fetch_jobs_page(
|
|
||||||
scraper_input, location_id, location_type, page, cursor
|
|
||||||
)
|
|
||||||
job_list.extend(jobs)
|
|
||||||
if not jobs or len(job_list) >= scraper_input.results_wanted:
|
|
||||||
job_list = job_list[: scraper_input.results_wanted]
|
|
||||||
break
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Glassdoor: {str(e)}")
|
|
||||||
break
|
|
||||||
return JobResponse(jobs=job_list)
|
return JobResponse(jobs=job_list)
|
||||||
|
|
||||||
def _fetch_jobs_page(
|
def _fetch_jobs_page(
|
||||||
|
@ -150,6 +137,38 @@ class GlassdoorScraper(Scraper):
|
||||||
res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
|
res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def get_jobs(self, scraper_input, location_id: int, location_type: str) -> List[JobPost]:
    """
    Collect job posts for one Glassdoor location, paging through results.

    Pages are requested through ``self._fetch_jobs_page`` starting at the
    page derived from ``scraper_input.offset`` and stopping when a page
    comes back empty, when ``scraper_input.results_wanted`` posts have been
    gathered, or when ``self.max_pages`` is reached.

    Args:
        scraper_input: search parameters; ``offset`` and ``results_wanted``
            are read here, the object is forwarded to ``_fetch_jobs_page``.
        location_id: Glassdoor location id to search in.
        location_type: readable location type for that id; ``None`` means
            the location could not be resolved.

    Returns:
        A list of job posts (at most ``results_wanted``). The list is empty
        when the location is unresolved, and contains whatever was gathered
        so far when a page request fails. This method never returns a
        ``JobResponse``: callers take ``len()`` of the result and extend a
        list with it.
    """
    if location_type is None:
        logger.error("Glassdoor: location not parsed")
        return []

    job_list: list[JobPost] = []
    cursor = None
    try:
        range_start = 1 + (scraper_input.offset // self.jobs_per_page)
        tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
        range_end = min(tot_pages, self.max_pages + 1)
        for page in range(range_start, range_end):
            logger.info(f"search page: {page} / {range_end-1}")
            jobs, cursor = self._fetch_jobs_page(
                scraper_input, location_id, location_type, page, cursor
            )
            job_list.extend(jobs)
            if not jobs or len(job_list) >= scraper_input.results_wanted:
                job_list = job_list[: scraper_input.results_wanted]
                break
    except Exception as e:
        # Best effort: one failing page must not discard what was already
        # collected. The message deliberately does not mention a page
        # number, because the failure may happen before the loop starts.
        logger.error(f"Glassdoor: {str(e)}")
    return job_list
|
||||||
|
|
||||||
def _get_csrf_token(self):
|
def _get_csrf_token(self):
|
||||||
"""
|
"""
|
||||||
Fetches csrf token needed for API by visiting a generic page
|
Fetches csrf token needed for API by visiting a generic page
|
||||||
|
@ -274,6 +293,7 @@ class GlassdoorScraper(Scraper):
|
||||||
items = res.json()
|
items = res.json()
|
||||||
|
|
||||||
if not items:
|
if not items:
|
||||||
|
logger.error(f"location not found in Glassdoor: {location}")
|
||||||
raise ValueError(f"Location '{location}' not found on Glassdoor")
|
raise ValueError(f"Location '{location}' not found on Glassdoor")
|
||||||
location_type = items[0]["locationType"]
|
location_type = items[0]["locationType"]
|
||||||
if location_type == "C":
|
if location_type == "C":
|
||||||
|
@ -282,8 +302,41 @@ class GlassdoorScraper(Scraper):
|
||||||
location_type = "STATE"
|
location_type = "STATE"
|
||||||
elif location_type == "N":
|
elif location_type == "N":
|
||||||
location_type = "COUNTRY"
|
location_type = "COUNTRY"
|
||||||
|
|
||||||
return int(items[0]["locationId"]), location_type
|
return int(items[0]["locationId"]), location_type
|
||||||
|
|
||||||
|
# Example string 'Tel Aviv, Israel'
|
||||||
|
def get_city_from_location(self, location:str) -> str:
    """
    Return the text before the first comma of *location*, with surrounding
    whitespace removed (the whole string when it contains no comma).
    No URL-encoding is applied to the result.
    """
    city_part, _separator, _rest = location.partition(',')
    return city_part.strip()
|
||||||
|
|
||||||
|
def _get_locations(self, location: str, is_remote: bool) -> List[GlassDoorLocationResponse]:
    """
    Look up the Glassdoor location entries matching *location*.

    The lookup endpoint is queried with the full *location* string and the
    returned entries are kept only when their ``label`` contains the city
    part of *location* (the text before the first comma).

    Args:
        location: free-text location such as 'Tel Aviv, Israel'.
        is_remote: when true (or when *location* is empty) no request is
            made and a single placeholder entry for the remote search is
            returned.

    Returns:
        Always a list, so callers can iterate it unconditionally. The list
        is empty when Glassdoor answers with a non-200 status or when no
        entry matches; both cases are logged.
    """
    # Function-scope import: only this method needs it.
    from urllib.parse import quote

    if not location or is_remote:
        # remote options: id 11047 searched as a STATE ("S" is the code
        # that maps to "STATE"). Only locationId / locationType carry
        # meaning here; the remaining fields are placeholders.
        # NOTE(review): confirm no caller relies on the other fields.
        return [
            GlassDoorLocationResponse(
                compoundId="",
                countryName="",
                id="11047",
                label="Remote",
                locationId=11047,
                locationType="S",
                longName="Remote",
                realId=11047,
            )
        ]

    # Percent-encode the term so characters such as '&' or '#' in the
    # location cannot corrupt the query string.
    url = (
        f"{self.base_url}/findPopularLocationAjax.htm"
        f"?maxLocationsToReturn=10&term={quote(location)}"
    )
    res = self.session.get(url)
    if res.status_code != 200:
        if res.status_code == 429:
            logger.error("429 Response - Blocked by Glassdoor for too many requests")
        else:
            logger.error(f"Glassdoor response status code {res.status_code}")
        return []

    formatted_city = self.get_city_from_location(location)
    candidates: List[GlassDoorLocationResponse] = [
        GlassDoorLocationResponse(**item) for item in res.json()
    ]
    # Keep only the entries whose label mentions the requested city.
    matches = [
        item
        for item in candidates
        if item.label is not None and formatted_city in item.label
    ]
    if not matches:
        # Deliberately not raising: an unknown location is logged and
        # yields an empty result.
        logger.error(f"location not found in Glassdoor: {location}")
    return matches
|
||||||
|
|
||||||
def _add_payload(
|
def _add_payload(
|
||||||
self,
|
self,
|
||||||
location_id: int,
|
location_id: int,
|
||||||
|
|
|
@ -7,9 +7,10 @@ jobs = scrape_jobs(
|
||||||
site_name=["glassdoor"],
|
site_name=["glassdoor"],
|
||||||
search_term="software engineer",
|
search_term="software engineer",
|
||||||
google_search_term="software engineer jobs near Tel Aviv Israel since yesterday",
|
google_search_term="software engineer jobs near Tel Aviv Israel since yesterday",
|
||||||
location="Tel Aviv, israel",
|
location="Central, Israel",
|
||||||
results_wanted=20,
|
locations=["Tel Aviv, Israel","Ramat Gan, Israel","Central, Israel","Rehovot ,Israel"],
|
||||||
hours_old=72,
|
results_wanted=200,
|
||||||
|
hours_old=200,
|
||||||
country_indeed='israel',
|
country_indeed='israel',
|
||||||
|
|
||||||
# linkedin_fetch_description=True # gets more info such as description, direct job url (slower)
|
# linkedin_fetch_description=True # gets more info such as description, direct job url (slower)
|
||||||
|
|
Loading…
Reference in New Issue