Added an option to search across multiple cities

pull/231/head
Yariv Menachem 2024-12-10 15:34:43 +02:00
parent 7c002ae76e
commit d388211a92
5 changed files with 129 additions and 32 deletions

View File

@ -26,6 +26,7 @@ def scrape_jobs(
search_term: str | None = None, search_term: str | None = None,
google_search_term: str | None = None, google_search_term: str | None = None,
location: str | None = None, location: str | None = None,
locations: list[str] | None = None,
distance: int | None = 50, distance: int | None = 50,
is_remote: bool = False, is_remote: bool = False,
job_type: str | None = None, job_type: str | None = None,
@ -89,6 +90,7 @@ def scrape_jobs(
search_term=search_term, search_term=search_term,
google_search_term=google_search_term, google_search_term=google_search_term,
location=location, location=location,
locations=locations,
distance=distance, distance=distance,
is_remote=is_remote, is_remote=is_remote,
job_type=job_type, job_type=job_type,

View File

@ -31,6 +31,7 @@ class ScraperInput(BaseModel):
google_search_term: str | None = None google_search_term: str | None = None
location: str | None = None location: str | None = None
locations: list[str] | None = None
country: Country | None = Country.USA country: Country | None = Country.USA
distance: int | None = None distance: int | None = None
is_remote: bool = False is_remote: bool = False

View File

@ -0,0 +1,40 @@
from typing import List
from dataclasses import dataclass
@dataclass
class GlassDoorLocationResponse:
    # One location record as returned by Glassdoor's
    # findPopularLocationAjax.htm endpoint; field names mirror the JSON keys
    # exactly because instances are built with GlassDoorLocationResponse(**item).
    compoundId: str
    countryName: str
    id: str
    label: str          # human-readable name, e.g. "Tel Aviv, Israel"
    locationId: int     # numeric id used by the jobs query payload (may arrive as str — see get_location_id)
    locationType: str   # single-letter code: "C" city, "S" state, "N" country
    longName: str
    realId: int
def get_location_type(glassDoorLocationResponse: "GlassDoorLocationResponse") -> str:
    """Map Glassdoor's single-letter ``locationType`` code to a readable name.

    Args:
        glassDoorLocationResponse: a location record whose ``locationType``
            is one of ``"C"`` (city), ``"S"`` (state) or ``"N"`` (country).

    Returns:
        ``"CITY"``, ``"STATE"``, ``"COUNTRY"``, or ``"UNKNOWN"`` for any
        unrecognized code.
    """
    # Table-driven mapping; unknown codes fall through to "UNKNOWN".
    code_to_name = {"C": "CITY", "S": "STATE", "N": "COUNTRY"}
    return code_to_name.get(glassDoorLocationResponse.locationType, "UNKNOWN")
def get_location_id(glassDoorLocationResponse: "GlassDoorLocationResponse") -> int:
    """Return the numeric Glassdoor location id for a location record.

    The API may deliver ``locationId`` as a string, so it is coerced to
    ``int`` explicitly before being used in the jobs query payload.
    """
    return int(glassDoorLocationResponse.locationId)
def print_locations(glassDoorLocationResponses: list[GlassDoorLocationResponse]):
    """
    Loop over all locations and print the mapped location types.
    """
    for loc in glassDoorLocationResponses:
        print(f"Location ID: {loc.locationId}, Type: {get_location_type(loc)}")

View File

@ -7,13 +7,16 @@ This module contains routines to scrape Glassdoor.
from __future__ import annotations from __future__ import annotations
from dataclasses import dataclass
import re import re
import json import json
import requests import requests
from typing import Optional, Tuple from typing import List, Optional, Tuple
from datetime import datetime, timedelta from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from jobspy.scrapers.glassdoor.GlassDoorLocation import GlassDoorLocationResponse, get_location_id, get_location_type
from .constants import fallback_token, query_template, headers from .constants import fallback_token, query_template, headers
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
from ..utils import extract_emails_from_text, create_logger from ..utils import extract_emails_from_text, create_logger
@ -34,7 +37,6 @@ from ...jobs import (
logger = create_logger("Glassdoor") logger = create_logger("Glassdoor")
class GlassdoorScraper(Scraper): class GlassdoorScraper(Scraper):
def __init__( def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None self, proxies: list[str] | str | None = None, ca_cert: str | None = None
@ -69,32 +71,17 @@ class GlassdoorScraper(Scraper):
token = self._get_csrf_token() token = self._get_csrf_token()
headers["gd-csrf-token"] = token if token else fallback_token headers["gd-csrf-token"] = token if token else fallback_token
self.session.headers.update(headers) self.session.headers.update(headers)
job_list: list[JobPost] = [];
location_id, location_type = self._get_location( for location in scraper_input.locations:
scraper_input.location, scraper_input.is_remote glassDoorLocatiions = self._get_locations(
location, scraper_input.is_remote
) )
if location_type is None: for glassDoorLocatiion in glassDoorLocatiions:
logger.error("Glassdoor: location not parsed") locationType = get_location_type(glassDoorLocatiion);
return JobResponse(jobs=[]) locationId = get_location_id(glassDoorLocatiion);
job_list: list[JobPost] = [] jobs_temp = self.get_jobs(scraper_input,locationId,locationType);
cursor = None if (jobs_temp is not None and len(jobs_temp) > 1):
job_list.extend(jobs_temp)
range_start = 1 + (scraper_input.offset // self.jobs_per_page)
tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
range_end = min(tot_pages, self.max_pages + 1)
for page in range(range_start, range_end):
logger.info(f"search page: {page} / {range_end-1}")
try:
jobs, cursor = self._fetch_jobs_page(
scraper_input, location_id, location_type, page, cursor
)
job_list.extend(jobs)
if not jobs or len(job_list) >= scraper_input.results_wanted:
job_list = job_list[: scraper_input.results_wanted]
break
except Exception as e:
logger.error(f"Glassdoor: {str(e)}")
break
return JobResponse(jobs=job_list) return JobResponse(jobs=job_list)
def _fetch_jobs_page( def _fetch_jobs_page(
@ -150,6 +137,38 @@ class GlassdoorScraper(Scraper):
res_json["data"]["jobListings"]["paginationCursors"], page_num + 1 res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
) )
def get_jobs(self, scraper_input, location_id: int, location_type: str) -> List[JobPost]:
    """
    Fetch jobs for one resolved location, paging until ``results_wanted``
    is reached, a page comes back empty, or ``max_pages`` is hit.

    Args:
        scraper_input: the search parameters (offset, results_wanted, ...).
        location_id: numeric Glassdoor location id.
        location_type: "CITY" / "STATE" / "COUNTRY"; ``None`` means the
            location could not be parsed.

    Returns:
        A list of JobPost objects — always a list (empty on failure), so
        callers can safely ``extend()`` with the result.
    """
    if location_type is None:
        logger.error("Glassdoor: location not parsed")
        # Was JobResponse(jobs=[]): callers extend() this return value,
        # so the failure value must be a plain list.
        return []
    job_list: list[JobPost] = []
    try:
        cursor = None
        range_start = 1 + (scraper_input.offset // self.jobs_per_page)
        tot_pages = (scraper_input.results_wanted // self.jobs_per_page) + 2
        range_end = min(tot_pages, self.max_pages + 1)
        for page in range(range_start, range_end):
            logger.info(f"search page: {page} / {range_end - 1}")
            try:
                jobs, cursor = self._fetch_jobs_page(
                    scraper_input, location_id, location_type, page, cursor
                )
                job_list.extend(jobs)
                if not jobs or len(job_list) >= scraper_input.results_wanted:
                    job_list = job_list[: scraper_input.results_wanted]
                    break
            except Exception as e:
                logger.error(f"Glassdoor: {str(e)}")
                break
        return job_list
    except Exception as e:
        # NOTE: the previous message interpolated `page`, which is unbound
        # when the failure happens before the loop starts (NameError inside
        # the handler). Log without it.
        logger.error(f"Glassdoor: failed to fetch jobs for location {location_id}: {str(e)}")
        return []  # return an empty list in case of failure
def _get_csrf_token(self): def _get_csrf_token(self):
""" """
Fetches csrf token needed for API by visiting a generic page Fetches csrf token needed for API by visiting a generic page
@ -274,6 +293,7 @@ class GlassdoorScraper(Scraper):
items = res.json() items = res.json()
if not items: if not items:
logger.error(f"location not found in Glassdoor: {location}")
raise ValueError(f"Location '{location}' not found on Glassdoor") raise ValueError(f"Location '{location}' not found on Glassdoor")
location_type = items[0]["locationType"] location_type = items[0]["locationType"]
if location_type == "C": if location_type == "C":
@ -282,8 +302,41 @@ class GlassdoorScraper(Scraper):
location_type = "STATE" location_type = "STATE"
elif location_type == "N": elif location_type == "N":
location_type = "COUNTRY" location_type = "COUNTRY"
return int(items[0]["locationId"]), location_type return int(items[0]["locationId"]), location_type
# Example: 'Tel Aviv, Israel' -> 'Tel Aviv'
def get_city_from_location(self, location: str) -> str:
    """Return the city portion of a 'City, Country' location string.

    Takes the text before the first comma and strips surrounding
    whitespace; a string with no comma is returned stripped as-is.
    """
    return location.split(',')[0].strip()
def _get_locations(self, location: str, is_remote: bool) -> List[GlassDoorLocationResponse]:
    """
    Resolve a free-text location into Glassdoor location records.

    Queries findPopularLocationAjax.htm and keeps only the results whose
    label contains the requested city name.

    Args:
        location: free-text location, e.g. "Tel Aviv, Israel".
        is_remote: when True (or location is empty), return the synthetic
            "remote" pseudo-state record (Glassdoor id 11047).

    Returns:
        A list of GlassDoorLocationResponse — always a list (empty on
        lookup failure or HTTP error), so callers can iterate safely.
    """
    if not location or is_remote:
        # Remote search. The previous code returned the tuple
        # ("11047", "STATE"), which crashed callers that iterate location
        # records; wrap the same id/type in a proper record instead.
        return [
            GlassDoorLocationResponse(
                compoundId="",
                countryName="",
                id="11047",
                label="Remote",
                locationId=11047,
                locationType="S",  # "S" maps to "STATE" in get_location_type
                longName="Remote",
                realId=11047,
            )
        ]
    url = f"{self.base_url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
    res = self.session.get(url)
    if res.status_code != 200:
        if res.status_code == 429:
            logger.error(f"429 Response - Blocked by Glassdoor for too many requests")
        else:
            logger.error(f"Glassdoor response status code {res.status_code}")
        # Was `return None, None`: callers iterate the result, so failures
        # must yield an empty list, not a tuple.
        return []
    formatted_city = self.get_city_from_location(location)
    items: List[GlassDoorLocationResponse] = [
        GlassDoorLocationResponse(**item) for item in res.json()
    ]
    # Filter items based on the processed city name.
    items = [
        item for item in items if item.label is not None and formatted_city in item.label
    ]
    if not items:
        logger.error(f"location not found in Glassdoor: {location}")
    return items
def _add_payload( def _add_payload(
self, self,
location_id: int, location_id: int,

View File

@ -7,9 +7,10 @@ jobs = scrape_jobs(
site_name=["glassdoor"], site_name=["glassdoor"],
search_term="software engineer", search_term="software engineer",
google_search_term="software engineer jobs near Tel Aviv Israel since yesterday", google_search_term="software engineer jobs near Tel Aviv Israel since yesterday",
location="Tel Aviv, israel", location="Central, Israel",
results_wanted=20, locations=["Tel Aviv, Israel","Ramat Gan, Israel","Central, Israel","Rehovot ,Israel"],
hours_old=72, results_wanted=200,
hours_old=200,
country_indeed='israel', country_indeed='israel',
# linkedin_fetch_description=True # gets more info such as description, direct job url (slower) # linkedin_fetch_description=True # gets more info such as description, direct job url (slower)