diff --git a/README.md b/README.md
index 76bbd21..4238921 100644
--- a/README.md
+++ b/README.md
@@ -6,13 +6,13 @@
 *Looking to build a data-focused software product?* **[Book a call](https://calendly.com/bunsly/15min)** *to work with us.*
-\
+
 Check out another project we wrote: ***[HomeHarvest](https://github.com/Bunsly/HomeHarvest)** – a Python package for real estate scraping*
 
 ## Features
 
-- Scrapes job postings from **LinkedIn**, **Indeed** & **ZipRecruiter** simultaneously
+- Scrapes job postings from **LinkedIn**, **Indeed**, **Glassdoor** & **ZipRecruiter** simultaneously
 - Aggregates the job postings in a Pandas DataFrame
 - Proxy support (HTTP/S, SOCKS)
 
@@ -35,15 +35,15 @@ _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/)
 from jobspy import scrape_jobs
 
 jobs = scrape_jobs(
-    site_name=["indeed", "linkedin", "zip_recruiter"],
+    site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor"],
     search_term="software engineer",
     location="Dallas, TX",
    results_wanted=10,
-    country_indeed='USA'  # only needed for indeed
+    country_indeed='USA'  # only needed for indeed / glassdoor
 )
 
 print(f"Found {len(jobs)} jobs")
 print(jobs.head())
-jobs.to_csv("jobs.csv", index=False)  # / to_xlsx
+jobs.to_csv("jobs.csv", index=False)  # to_xlsx
 ```
 
 ### Output
 
@@ -120,30 +120,31 @@ ZipRecruiter searches for jobs in **US/Canada** & uses only the `location` param
 
 ### **Indeed**
 
-Indeed supports most countries, but the `country_indeed` parameter is required. Additionally, use the `location`
+Indeed & Glassdoor support most countries, but the `country_indeed` parameter is required. Additionally, use the `location`
 parameter to narrow down the location, e.g. city & state if necessary.
 
-You can specify the following countries when searching on Indeed (use the exact name):
+You can specify the following countries when searching on Indeed (use the exact name; an asterisk denotes Glassdoor support):
 
 |                      |              |            |                |
 |----------------------|--------------|------------|----------------|
-| Argentina            | Australia    | Austria    | Bahrain        |
-| Belgium              | Brazil       | Canada     | Chile          |
+| Argentina            | Australia*   | Austria*   | Bahrain        |
+| Belgium*             | Brazil*      | Canada*    | Chile          |
 | China                | Colombia     | Costa Rica | Czech Republic |
 | Denmark              | Ecuador      | Egypt      | Finland        |
-| France               | Germany      | Greece     | Hong Kong      |
-| Hungary              | India        | Indonesia  | Ireland        |
-| Israel               | Italy        | Japan      | Kuwait         |
-| Luxembourg           | Malaysia     | Mexico     | Morocco        |
-| Netherlands          | New Zealand  | Nigeria    | Norway         |
+| France*              | Germany*     | Greece     | Hong Kong*     |
+| Hungary              | India*       | Indonesia  | Ireland*       |
+| Israel               | Italy*       | Japan      | Kuwait         |
+| Luxembourg           | Malaysia     | Mexico*    | Morocco        |
+| Netherlands*         | New Zealand* | Nigeria    | Norway         |
 | Oman                 | Pakistan     | Panama     | Peru           |
 | Philippines          | Poland       | Portugal   | Qatar          |
-| Romania              | Saudi Arabia | Singapore  | South Africa   |
-| South Korea          | Spain        | Sweden     | Switzerland    |
+| Romania              | Saudi Arabia | Singapore* | South Africa   |
+| South Korea          | Spain*       | Sweden     | Switzerland*   |
 | Taiwan               | Thailand     | Turkey     | Ukraine        |
-| United Arab Emirates | UK           | USA        | Uruguay        |
+| United Arab Emirates | UK*          | USA*       | Uruguay        |
 | Venezuela            | Vietnam      |            |                |
 
+
 ## Frequently Asked Questions
 
 ---
diff --git a/pyproject.toml b/pyproject.toml
index 155cd8b..ebd4c1d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.23"
-description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
+version = "1.1.24"
+description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton ", "Cullen Watson "]
 homepage = "https://github.com/Bunsly/JobSpy"
 readme = "README.md"
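Any starred country in the table above can be handed to `country_indeed` and satisfies both boards. A minimal sketch (search term and location are illustrative):

```python
from jobspy import scrape_jobs

# "Australia" is starred above, so it resolves for Indeed and Glassdoor alike;
# an unstarred country combined with site_name="glassdoor" fails instead.
jobs = scrape_jobs(
    site_name=["indeed", "glassdoor"],
    search_term="data analyst",
    location="Sydney",
    results_wanted=10,
    country_indeed="Australia",
)
print(jobs.head())
```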
ZipRecruiter" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/JobSpy" readme = "README.md" diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py index 73d53c3..7c5fa64 100644 --- a/src/jobspy/__init__.py +++ b/src/jobspy/__init__.py @@ -6,18 +6,21 @@ from typing import Tuple, Optional from .jobs import JobType, Location from .scrapers.indeed import IndeedScraper from .scrapers.ziprecruiter import ZipRecruiterScraper +from .scrapers.glassdoor import GlassdoorScraper from .scrapers.linkedin import LinkedInScraper from .scrapers import ScraperInput, Site, JobResponse, Country from .scrapers.exceptions import ( LinkedInException, IndeedException, ZipRecruiterException, + GlassdoorException, ) SCRAPER_MAPPING = { Site.LINKEDIN: LinkedInScraper, Site.INDEED: IndeedScraper, Site.ZIP_RECRUITER: ZipRecruiterScraper, + Site.GLASSDOOR: GlassdoorScraper, } @@ -90,6 +93,8 @@ def scrape_jobs( raise IndeedException(str(e)) if site == Site.ZIP_RECRUITER: raise ZipRecruiterException(str(e)) + if site == Site.GLASSDOOR: + raise GlassdoorException(str(e)) else: raise e return site.value, scraped_data @@ -127,7 +132,10 @@ def scrape_jobs( job_data["emails"] = ( ", ".join(job_data["emails"]) if job_data["emails"] else None ) - job_data["location"] = Location(**job_data["location"]).display_location() + if job_data["location"]: + job_data["location"] = Location( + **job_data["location"] + ).display_location() compensation_obj = job_data.get("compensation") if compensation_obj and isinstance(compensation_obj, dict): diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py index 4131378..0737824 100644 --- a/src/jobspy/jobs/__init__.py +++ b/src/jobspy/jobs/__init__.py @@ -1,7 +1,6 @@ from typing import Union, Optional from datetime import date from enum import Enum - from pydantic import BaseModel, validator @@ -56,13 +55,13 @@ class JobType(Enum): class Country(Enum): - ARGENTINA = ("argentina", "ar") - AUSTRALIA = ("australia", "au") - AUSTRIA = ("austria", "at") + ARGENTINA = ("argentina", "com.ar") + AUSTRALIA = ("australia", "au", "com.au") + AUSTRIA = ("austria", "at", "at") BAHRAIN = ("bahrain", "bh") - BELGIUM = ("belgium", "be") - BRAZIL = ("brazil", "br") - CANADA = ("canada", "ca") + BELGIUM = ("belgium", "be", "nl:be") + BRAZIL = ("brazil", "br", "com.br") + CANADA = ("canada", "ca", "ca") CHILE = ("chile", "cl") CHINA = ("china", "cn") COLOMBIA = ("colombia", "co") @@ -72,24 +71,24 @@ class Country(Enum): ECUADOR = ("ecuador", "ec") EGYPT = ("egypt", "eg") FINLAND = ("finland", "fi") - FRANCE = ("france", "fr") - GERMANY = ("germany", "de") + FRANCE = ("france", "fr", "fr") + GERMANY = ("germany", "de", "de") GREECE = ("greece", "gr") - HONGKONG = ("hong kong", "hk") + HONGKONG = ("hong kong", "hk", "com.hk") HUNGARY = ("hungary", "hu") - INDIA = ("india", "in") + INDIA = ("india", "in", "co.in") INDONESIA = ("indonesia", "id") - IRELAND = ("ireland", "ie") + IRELAND = ("ireland", "ie", "ie") ISRAEL = ("israel", "il") - ITALY = ("italy", "it") + ITALY = ("italy", "it", "it") JAPAN = ("japan", "jp") KUWAIT = ("kuwait", "kw") LUXEMBOURG = ("luxembourg", "lu") MALAYSIA = ("malaysia", "malaysia") - MEXICO = ("mexico", "mx") + MEXICO = ("mexico", "mx", "com.mx") MOROCCO = ("morocco", "ma") - NETHERLANDS = ("netherlands", "nl") - NEWZEALAND = ("new zealand", "nz") + NETHERLANDS = ("netherlands", "nl", "nl") + NEWZEALAND = ("new zealand", "nz", "co.nz") NIGERIA = ("nigeria", "ng") NORWAY = ("norway", "no") OMAN = ("oman", "om") @@ -102,19 +101,19 
diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py
index 4131378..0737824 100644
--- a/src/jobspy/jobs/__init__.py
+++ b/src/jobspy/jobs/__init__.py
@@ -1,7 +1,6 @@
 from typing import Union, Optional
 from datetime import date
 from enum import Enum
-
 from pydantic import BaseModel, validator
 
 
@@ -56,13 +55,13 @@ class JobType(Enum):
 
 
 class Country(Enum):
-    ARGENTINA = ("argentina", "ar")
-    AUSTRALIA = ("australia", "au")
-    AUSTRIA = ("austria", "at")
+    ARGENTINA = ("argentina", "com.ar")
+    AUSTRALIA = ("australia", "au", "com.au")
+    AUSTRIA = ("austria", "at", "at")
     BAHRAIN = ("bahrain", "bh")
-    BELGIUM = ("belgium", "be")
-    BRAZIL = ("brazil", "br")
-    CANADA = ("canada", "ca")
+    BELGIUM = ("belgium", "be", "nl:be")
+    BRAZIL = ("brazil", "br", "com.br")
+    CANADA = ("canada", "ca", "ca")
     CHILE = ("chile", "cl")
     CHINA = ("china", "cn")
     COLOMBIA = ("colombia", "co")
@@ -72,24 +71,24 @@ class Country(Enum):
     ECUADOR = ("ecuador", "ec")
     EGYPT = ("egypt", "eg")
     FINLAND = ("finland", "fi")
-    FRANCE = ("france", "fr")
-    GERMANY = ("germany", "de")
+    FRANCE = ("france", "fr", "fr")
+    GERMANY = ("germany", "de", "de")
     GREECE = ("greece", "gr")
-    HONGKONG = ("hong kong", "hk")
+    HONGKONG = ("hong kong", "hk", "com.hk")
     HUNGARY = ("hungary", "hu")
-    INDIA = ("india", "in")
+    INDIA = ("india", "in", "co.in")
     INDONESIA = ("indonesia", "id")
-    IRELAND = ("ireland", "ie")
+    IRELAND = ("ireland", "ie", "ie")
     ISRAEL = ("israel", "il")
-    ITALY = ("italy", "it")
+    ITALY = ("italy", "it", "it")
     JAPAN = ("japan", "jp")
     KUWAIT = ("kuwait", "kw")
     LUXEMBOURG = ("luxembourg", "lu")
     MALAYSIA = ("malaysia", "malaysia")
-    MEXICO = ("mexico", "mx")
+    MEXICO = ("mexico", "mx", "com.mx")
     MOROCCO = ("morocco", "ma")
-    NETHERLANDS = ("netherlands", "nl")
-    NEWZEALAND = ("new zealand", "nz")
+    NETHERLANDS = ("netherlands", "nl", "nl")
+    NEWZEALAND = ("new zealand", "nz", "co.nz")
     NIGERIA = ("nigeria", "ng")
     NORWAY = ("norway", "no")
     OMAN = ("oman", "om")
@@ -102,19 +101,19 @@ class Country(Enum):
     QATAR = ("qatar", "qa")
     ROMANIA = ("romania", "ro")
     SAUDIARABIA = ("saudi arabia", "sa")
-    SINGAPORE = ("singapore", "sg")
+    SINGAPORE = ("singapore", "sg", "sg")
     SOUTHAFRICA = ("south africa", "za")
     SOUTHKOREA = ("south korea", "kr")
-    SPAIN = ("spain", "es")
+    SPAIN = ("spain", "es", "es")
     SWEDEN = ("sweden", "se")
-    SWITZERLAND = ("switzerland", "ch")
+    SWITZERLAND = ("switzerland", "ch", "de:ch")
     TAIWAN = ("taiwan", "tw")
     THAILAND = ("thailand", "th")
     TURKEY = ("turkey", "tr")
     UKRAINE = ("ukraine", "ua")
     UNITEDARABEMIRATES = ("united arab emirates", "ae")
-    UK = ("uk", "uk")
-    USA = ("usa", "www")
+    UK = ("uk", "uk", "co.uk")
+    USA = ("usa", "www", "com")
     URUGUAY = ("uruguay", "uy")
     VENEZUELA = ("venezuela", "ve")
     VIETNAM = ("vietnam", "vn")
@@ -125,31 +124,39 @@ class Country(Enum):
     # internal for linkedin
     WORLDWIDE = ("worldwide", "www")
 
-    def __new__(cls, country, domain):
-        obj = object.__new__(cls)
-        obj._value_ = country
-        obj.domain = domain
-        return obj
+    @property
+    def indeed_domain_value(self):
+        return self.value[1]
 
     @property
-    def domain_value(self):
-        return self.domain
+    def glassdoor_domain_value(self):
+        if len(self.value) == 3:
+            subdomain, _, domain = self.value[2].partition(":")
+            if subdomain and domain:
+                return f"{subdomain}.glassdoor.{domain}"
+            else:
+                return f"www.glassdoor.{self.value[2]}"
+        else:
+            raise Exception(f"Glassdoor is not available for {self.name}")
+
+    def get_url(self):
+        return f"https://{self.glassdoor_domain_value}/"
 
     @classmethod
     def from_string(cls, country_str: str):
         """Convert a string to the corresponding Country enum."""
         country_str = country_str.strip().lower()
         for country in cls:
-            if country.value == country_str:
+            if country.value[0] == country_str:
                 return country
         valid_countries = [country.value for country in cls]
         raise ValueError(
-            f"Invalid country string: '{country_str}'. Valid countries (only include this param for Indeed) are: {', '.join(valid_countries)}"
+            f"Invalid country string: '{country_str}'. Valid countries are: {', '.join([country[0] for country in valid_countries])}"
         )
 
 
 class Location(BaseModel):
-    country: Country = None
+    country: Country | None = None
     city: Optional[str] = None
     state: Optional[str] = None
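The tuple layout drives both boards' URL logic: index 1 is the Indeed subdomain, and an optional index 2 encodes the Glassdoor domain, with `subdomain:tld` entries splitting on the colon. A quick sketch of the resulting values:

```python
from jobspy.jobs import Country

uk = Country.from_string("uk")
print(uk.indeed_domain_value)     # uk
print(uk.glassdoor_domain_value)  # www.glassdoor.co.uk

# "de:ch" partitions into a localized subdomain plus TLD:
print(Country.SWITZERLAND.get_url())  # https://de.glassdoor.ch/

# Two-element entries have no Glassdoor presence and raise:
try:
    Country.CHILE.glassdoor_domain_value
except Exception as e:
    print(e)  # Glassdoor is not available for CHILE
```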
@@ -160,10 +167,10 @@ class Location(BaseModel):
         if self.state:
             location_parts.append(self.state)
         if self.country and self.country not in (Country.US_CANADA, Country.WORLDWIDE):
-            if self.country.value in ("usa", "uk"):
-                location_parts.append(self.country.value.upper())
+            if self.country.value[0] in ("usa", "uk"):
+                location_parts.append(self.country.value[0].upper())
             else:
-                location_parts.append(self.country.value.title())
+                location_parts.append(self.country.value[0].title())
 
         return ", ".join(location_parts)
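`display_location()` now pulls the display name from the tuple's first slot; output is unchanged for the usual cases, and a missing country (now explicitly allowed) simply drops that part:

```python
from jobspy.jobs import Location, Country

print(Location(city="Dallas", state="TX", country=Country.USA).display_location())
# Dallas, TX, USA

print(Location(city="Berlin", country=Country.GERMANY).display_location())
# Berlin, Germany

print(Location(city="Toronto", state="ON").display_location())
# Toronto, ON
```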
diff --git a/src/jobspy/scrapers/__init__.py b/src/jobspy/scrapers/__init__.py
index dedd26f..97aaad0 100644
--- a/src/jobspy/scrapers/__init__.py
+++ b/src/jobspy/scrapers/__init__.py
@@ -6,6 +6,7 @@ class Site(Enum):
     LINKEDIN = "linkedin"
     INDEED = "indeed"
     ZIP_RECRUITER = "zip_recruiter"
+    GLASSDOOR = "glassdoor"
 
 
 class ScraperInput(BaseModel):
diff --git a/src/jobspy/scrapers/exceptions.py b/src/jobspy/scrapers/exceptions.py
index c6b1eea..e49680b 100644
--- a/src/jobspy/scrapers/exceptions.py
+++ b/src/jobspy/scrapers/exceptions.py
@@ -19,3 +19,8 @@ class IndeedException(Exception):
 class ZipRecruiterException(Exception):
     def __init__(self, message=None):
         super().__init__(message or "An error occurred with ZipRecruiter")
+
+
+class GlassdoorException(Exception):
+    def __init__(self, message=None):
+        super().__init__(message or "An error occurred with Glassdoor")
diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py
new file mode 100644
index 0000000..52d4130
--- /dev/null
+++ b/src/jobspy/scrapers/glassdoor/__init__.py
@@ -0,0 +1,279 @@
+"""
+jobspy.scrapers.glassdoor
+~~~~~~~~~~~~~~~~~~~
+
+This module contains routines to scrape Glassdoor.
+"""
+import math
+import time
+import re
+import json
+from datetime import datetime, date
+from typing import Optional, Tuple, Any
+from bs4 import BeautifulSoup
+
+from .. import Scraper, ScraperInput, Site
+from ..exceptions import GlassdoorException
+from ..utils import count_urgent_words, extract_emails_from_text, create_session
+from ...jobs import (
+    JobPost,
+    Compensation,
+    CompensationInterval,
+    Location,
+    JobResponse,
+    JobType,
+    Country,
+)
+
+
+class GlassdoorScraper(Scraper):
+    def __init__(self, proxy: Optional[str] = None):
+        """
+        Initializes GlassdoorScraper with the Glassdoor job search url
+        """
+        site = Site(Site.GLASSDOOR)
+        super().__init__(site, proxy=proxy)
+
+        self.url = None
+        self.country = None
+        self.jobs_per_page = 30
+        self.seen_urls = set()
+
+    def fetch_jobs_page(
+        self,
+        scraper_input: ScraperInput,
+        location_id: int,
+        location_type: str,
+        page_num: int,
+        cursor: str | None,
+    ) -> tuple[list[JobPost], str | None]:
+        """
+        Scrapes a page of Glassdoor for jobs with scraper_input criteria
+        :param scraper_input:
+        :return: jobs found on page
+        :return: cursor for next page
+        """
+        try:
+            payload = self.add_payload(
+                scraper_input, location_id, location_type, page_num, cursor
+            )
+            session = create_session(self.proxy, is_tls=False)
+            response = session.post(
+                f"{self.url}/graph", headers=self.headers(), timeout=10, data=payload
+            )
+            if response.status_code != 200:
+                raise GlassdoorException(
+                    f"bad response status code: {response.status_code}"
+                )
+            res_json = response.json()[0]
+            if "errors" in res_json:
+                raise ValueError("Error encountered in API response")
+        except Exception as e:
+            raise GlassdoorException(str(e))
+
+        jobs_data = res_json["data"]["jobListings"]["jobListings"]
+
+        jobs = []
+        for i, job in enumerate(jobs_data):
+            job_url = res_json["data"]["jobListings"]["jobListingSeoLinks"][
+                "linkItems"
+            ][i]["url"]
+            job = job["jobview"]
+            title = job["job"]["jobTitleText"]
+            company_name = job["header"]["employerNameFromSearch"]
+            location_name = job["header"].get("locationName", "")
+            location_type = job["header"].get("locationType", "")
+            is_remote = False
+            location = None
+
+            if location_type == "S":
+                is_remote = True
+            else:
+                location = self.parse_location(location_name)
+
+            compensation = self.parse_compensation(job["header"])
+
+            job = JobPost(
+                title=title,
+                company_name=company_name,
+                job_url=job_url,
+                location=location,
+                compensation=compensation,
+                is_remote=is_remote,
+            )
+            jobs.append(job)
+
+        return jobs, self.get_cursor_for_page(
+            res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
+        )
+
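For orientation, the response shape that `fetch_jobs_page` walks looks roughly like the following; this is a fabricated, heavily trimmed example that keeps only the fields the parser touches:

```python
res_json = {
    "data": {
        "jobListings": {
            "jobListings": [
                {
                    "jobview": {
                        "job": {"jobTitleText": "Software Engineer"},
                        "header": {
                            "employerNameFromSearch": "Acme Corp",
                            "locationName": "Dallas, TX",
                            "locationType": "C",  # "S" would mean remote
                            "payPeriod": "ANNUAL",
                            "payPeriodAdjustedPay": {"p10": 90000, "p90": 140000},
                            "payCurrency": "USD",
                        },
                    }
                }
            ],
            "jobListingSeoLinks": {"linkItems": [{"url": "https://..."}]},
            "paginationCursors": [{"pageNumber": 2, "cursor": "AB3cd"}],
        }
    }
}
```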
+ """ + self.country = scraper_input.country + self.url = self.country.get_url() + + location_id, location_type = self.get_location( + scraper_input.location, scraper_input.is_remote + ) + all_jobs: list[JobPost] = [] + cursor = None + max_pages = 30 + + try: + for page in range( + 1 + (scraper_input.offset // self.jobs_per_page), + min( + (scraper_input.results_wanted // self.jobs_per_page) + 2, + max_pages + 1, + ), + ): + try: + jobs, cursor = self.fetch_jobs_page( + scraper_input, location_id, location_type, page, cursor + ) + all_jobs.extend(jobs) + if len(all_jobs) >= scraper_input.results_wanted: + all_jobs = all_jobs[: scraper_input.results_wanted] + break + except Exception as e: + print(f"Page {page} generated an exception: {e}") + except Exception as e: + print(f"An exception occurred: {e}") + + return JobResponse(jobs=all_jobs) + + @staticmethod + def parse_compensation(data: dict) -> Optional[Compensation]: + pay_period = data.get("payPeriod") + adjusted_pay = data.get("payPeriodAdjustedPay") + currency = data.get("payCurrency", "USD") + + if not pay_period or not adjusted_pay: + return None + + interval = None + if pay_period == "ANNUAL": + interval = CompensationInterval.YEARLY + elif pay_period == "MONTHLY": + interval = CompensationInterval.MONTHLY + elif pay_period == "WEEKLY": + interval = CompensationInterval.WEEKLY + elif pay_period == "DAILY": + interval = CompensationInterval.DAILY + elif pay_period == "HOURLY": + interval = CompensationInterval.HOURLY + + min_amount = int(adjusted_pay.get("p10") // 1) + max_amount = int(adjusted_pay.get("p90") // 1) + + return Compensation( + interval=interval, + min_amount=min_amount, + max_amount=max_amount, + currency=currency, + ) + + def get_job_type_enum(self, job_type_str: str) -> list[JobType] | None: + for job_type in JobType: + if job_type_str in job_type.value: + return [job_type] + return None + + def get_location(self, location: str, is_remote: bool) -> (int, str): + if not location or is_remote: + return "11047", "S" # remote options + url = f"{self.url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}" + session = create_session(self.proxy) + response = session.get(url) + if response.status_code != 200: + raise GlassdoorException( + f"bad response status code: {response.status_code}" + ) + items = response.json() + if not items: + raise ValueError(f"Location '{location}' not found on Glassdoor") + return int(items[0]["locationId"]), items[0]["locationType"] + + @staticmethod + def add_payload( + scraper_input, + location_id: int, + location_type: str, + page_num: int, + cursor: str | None = None, + ) -> dict[str, str | Any]: + payload = { + "operationName": "JobSearchResultsQuery", + "variables": { + "excludeJobListingIds": [], + "filterParams": [], + "keyword": scraper_input.search_term, + "numJobsToShow": 30, + "originalPageUrl": "https://www.glassdoor.com/Job/software-intern-jobs-SRCH_KO0,15.htm", + "parameterUrlInput": f"IL.0,12_I{location_type}{location_id}", + "seoFriendlyUrlInput": "software-intern-jobs", + "seoUrl": True, + "pageNumber": page_num, + "pageCursor": cursor, + }, + "query": "query JobSearchResultsQuery($excludeJobListingIds: [Long!], $keyword: String, $locationId: Int, $locationType: LocationTypeEnum, $numJobsToShow: Int!, $pageCursor: String, $pageNumber: Int, $filterParams: [FilterParams], $originalPageUrl: String, $seoFriendlyUrlInput: String, $parameterUrlInput: String, $seoUrl: Boolean) {\n jobListings(\n contextHolder: {searchParams: {excludeJobListingIds: 
+    @staticmethod
+    def add_payload(
+        scraper_input,
+        location_id: int,
+        location_type: str,
+        page_num: int,
+        cursor: str | None = None,
+    ) -> str:
+        payload = {
+            "operationName": "JobSearchResultsQuery",
+            "variables": {
+                "excludeJobListingIds": [],
+                "filterParams": [],
+                "keyword": scraper_input.search_term,
+                "numJobsToShow": 30,
+                "originalPageUrl": "https://www.glassdoor.com/Job/software-intern-jobs-SRCH_KO0,15.htm",
+                "parameterUrlInput": f"IL.0,12_I{location_type}{location_id}",
+                "seoFriendlyUrlInput": "software-intern-jobs",
+                "seoUrl": True,
+                "pageNumber": page_num,
+                "pageCursor": cursor,
+            },
+            "query": "query JobSearchResultsQuery($excludeJobListingIds: [Long!], $keyword: String, $locationId: Int, $locationType: LocationTypeEnum, $numJobsToShow: Int!, $pageCursor: String, $pageNumber: Int, $filterParams: [FilterParams], $originalPageUrl: String, $seoFriendlyUrlInput: String, $parameterUrlInput: String, $seoUrl: Boolean) {\n jobListings(\n contextHolder: {searchParams: {excludeJobListingIds: $excludeJobListingIds, keyword: $keyword, locationId: $locationId, locationType: $locationType, numPerPage: $numJobsToShow, pageCursor: $pageCursor, pageNumber: $pageNumber, filterParams: $filterParams, originalPageUrl: $originalPageUrl, seoFriendlyUrlInput: $seoFriendlyUrlInput, parameterUrlInput: $parameterUrlInput, seoUrl: $seoUrl, searchType: SR}}\n ) {\n companyFilterOptions {\n id\n shortName\n __typename\n }\n filterOptions\n indeedCtk\n jobListings {\n ...JobView\n __typename\n }\n jobListingSeoLinks {\n linkItems {\n position\n url\n __typename\n }\n __typename\n }\n jobSearchTrackingKey\n jobsPageSeoData {\n pageMetaDescription\n pageTitle\n __typename\n }\n paginationCursors {\n cursor\n pageNumber\n __typename\n }\n indexablePageForSeo\n searchResultsMetadata {\n searchCriteria {\n implicitLocation {\n id\n localizedDisplayName\n type\n __typename\n }\n keyword\n location {\n id\n shortName\n localizedShortName\n localizedDisplayName\n type\n __typename\n }\n __typename\n }\n footerVO {\n countryMenu {\n childNavigationLinks {\n id\n link\n textKey\n __typename\n }\n __typename\n }\n __typename\n }\n helpCenterDomain\n helpCenterLocale\n jobAlert {\n jobAlertExists\n __typename\n }\n jobSerpFaq {\n questions {\n answer\n question\n __typename\n }\n __typename\n }\n jobSerpJobOutlook {\n occupation\n paragraph\n __typename\n }\n showMachineReadableJobs\n __typename\n }\n serpSeoLinksVO {\n relatedJobTitlesResults\n searchedJobTitle\n searchedKeyword\n searchedLocationIdAsString\n searchedLocationSeoName\n searchedLocationType\n topCityIdsToNameResults {\n key\n value\n __typename\n }\n topEmployerIdsToNameResults {\n key\n value\n __typename\n }\n topEmployerNameResults\n topOccupationResults\n __typename\n }\n totalJobsCount\n __typename\n }\n}\n\nfragment JobView on JobListingSearchResult {\n jobview {\n header {\n adOrderId\n advertiserType\n adOrderSponsorshipLevel\n ageInDays\n divisionEmployerName\n easyApply\n employer {\n id\n name\n shortName\n __typename\n }\n employerNameFromSearch\n goc\n gocConfidence\n gocId\n jobCountryId\n jobLink\n jobResultTrackingKey\n jobTitleText\n locationName\n locationType\n locId\n needsCommission\n payCurrency\n payPeriod\n payPeriodAdjustedPay {\n p10\n p50\n p90\n __typename\n }\n rating\n salarySource\n savedJobId\n sponsored\n __typename\n }\n job {\n descriptionFragments\n importConfigId\n jobTitleId\n jobTitleText\n listingId\n __typename\n }\n jobListingAdminDetails {\n cpcVal\n importConfigId\n jobListingId\n jobSourceId\n userEligibleForAdminJobDetails\n __typename\n }\n overview {\n shortName\n squareLogoUrl\n __typename\n }\n __typename\n }\n __typename\n}\n",
+        }
+
+        job_type_filters = {
+            JobType.FULL_TIME: "fulltime",
+            JobType.PART_TIME: "parttime",
+            JobType.CONTRACT: "contract",
+            JobType.INTERNSHIP: "internship",
+            JobType.TEMPORARY: "temporary",
+        }
+
+        if scraper_input.job_type in job_type_filters:
+            filter_value = job_type_filters[scraper_input.job_type]
+            payload["variables"]["filterParams"].append(
+                {"filterKey": "jobType", "values": filter_value}
+            )
+
+        return json.dumps([payload])
+
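One detail worth noting: the payload is serialized as a one-element JSON array, and `fetch_jobs_page` unwraps the response with `response.json()[0]`, so the `/graph` endpoint is effectively treated as a batch API. In miniature (variables trimmed):

```python
import json

payload = {"operationName": "JobSearchResultsQuery", "variables": {"pageNumber": 1}}
body = json.dumps([payload])  # request: a batch containing one query
print(body)
# [{"operationName": "JobSearchResultsQuery", "variables": {"pageNumber": 1}}]
# ...and the response is unwrapped the same way: res_json = response.json()[0]
```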
+    def parse_location(self, location_name: str) -> Optional[Location]:
+        if not location_name or location_name == "Remote":
+            return None
+        city, _, state = location_name.partition(", ")
+        return Location(city=city, state=state)
+
+    @staticmethod
+    def get_cursor_for_page(pagination_cursors, page_num):
+        for cursor_data in pagination_cursors:
+            if cursor_data["pageNumber"] == page_num:
+                return cursor_data["cursor"]
+        return None
+
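Cursor handling in one picture: each response carries cursors for nearby pages, and the scraper picks out the one matching the next page number. A minimal sketch with a fabricated cursor list:

```python
pagination_cursors = [
    {"pageNumber": 2, "cursor": "AB3cd"},
    {"pageNumber": 3, "cursor": "EF6gh"},
]

def get_cursor_for_page(cursors, page_num):
    # Same logic as GlassdoorScraper.get_cursor_for_page
    for cursor_data in cursors:
        if cursor_data["pageNumber"] == page_num:
            return cursor_data["cursor"]
    return None

print(get_cursor_for_page(pagination_cursors, 2))  # AB3cd
print(get_cursor_for_page(pagination_cursors, 9))  # None
```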
"gd-csrf-token": "Ft6oHEWlRZrxDww95Cpazw:0pGUrkb2y3TyOpAIqF2vbPmUXoXVkD3oEGDVkvfeCerceQ5-n8mBg3BovySUIjmCPHCaW0H2nQVdqzbtsYqf4Q:wcqRqeegRUa9MVLJGyujVXB7vWFPjdaS1CtrrzJq-ok", + "origin": "https://www.glassdoor.com", + "referer": "https://www.glassdoor.com/", + "sec-ch-ua": '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"', + "sec-ch-ua-mobile": "?0", + "sec-ch-ua-platform": '"macOS"', + "sec-fetch-dest": "empty", + "sec-fetch-mode": "cors", + "sec-fetch-site": "same-origin", + "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36", + } diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py index 297cfe1..61c19f7 100644 --- a/src/jobspy/scrapers/indeed/__init__.py +++ b/src/jobspy/scrapers/indeed/__init__.py @@ -56,7 +56,7 @@ class IndeedScraper(Scraper): :return: jobs found on page, total number of jobs found for search """ self.country = scraper_input.country - domain = self.country.domain_value + domain = self.country.indeed_domain_value self.url = f"https://{domain}.indeed.com" params = { @@ -258,12 +258,8 @@ class IndeedScraper(Scraper): except (KeyError, TypeError, IndexError): return None - soup = BeautifulSoup( - job_description, "html.parser" - ) - text_content = " ".join( - soup.get_text(separator=" ").split() - ).strip() + soup = BeautifulSoup(job_description, "html.parser") + text_content = " ".join(soup.get_text(separator=" ").split()).strip() return text_content diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py index 28e5331..26d4390 100644 --- a/src/jobspy/scrapers/linkedin/__init__.py +++ b/src/jobspy/scrapers/linkedin/__init__.py @@ -18,12 +18,7 @@ from threading import Lock from .. import Scraper, ScraperInput, Site from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type from ..exceptions import LinkedInException -from ...jobs import ( - JobPost, - Location, - JobResponse, - JobType, -) +from ...jobs import JobPost, Location, JobResponse, JobType, Country class LinkedInScraper(Scraper): @@ -181,7 +176,6 @@ class LinkedInScraper(Scraper): location=location, date_posted=date_posted, job_url=job_url, - # job_type=[JobType.FULL_TIME], job_type=job_type, benefits=benefits, emails=extract_emails_from_text(description) if description else None, @@ -246,7 +240,7 @@ class LinkedInScraper(Scraper): :param metadata_card :return: location """ - location = Location(country=self.country) + location = Location(country=Country.from_string(self.country)) if metadata_card is not None: location_tag = metadata_card.find( "span", class_="job-search-card__location" @@ -258,7 +252,7 @@ class LinkedInScraper(Scraper): location = Location( city=city, state=state, - country=self.country, + country=Country.from_string(self.country), ) return location diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/src/jobspy/scrapers/ziprecruiter/__init__.py index b999746..7559ade 100644 --- a/src/jobspy/scrapers/ziprecruiter/__init__.py +++ b/src/jobspy/scrapers/ziprecruiter/__init__.py @@ -16,13 +16,13 @@ from concurrent.futures import ThreadPoolExecutor from .. 
diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py
index 297cfe1..61c19f7 100644
--- a/src/jobspy/scrapers/indeed/__init__.py
+++ b/src/jobspy/scrapers/indeed/__init__.py
@@ -56,7 +56,7 @@ class IndeedScraper(Scraper):
         :return: jobs found on page, total number of jobs found for search
         """
         self.country = scraper_input.country
-        domain = self.country.domain_value
+        domain = self.country.indeed_domain_value
         self.url = f"https://{domain}.indeed.com"
 
         params = {
@@ -258,12 +258,8 @@ class IndeedScraper(Scraper):
         except (KeyError, TypeError, IndexError):
             return None
 
-        soup = BeautifulSoup(
-            job_description, "html.parser"
-        )
-        text_content = " ".join(
-            soup.get_text(separator=" ").split()
-        ).strip()
+        soup = BeautifulSoup(job_description, "html.parser")
+        text_content = " ".join(soup.get_text(separator=" ").split()).strip()
 
         return text_content
 
diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py
index 28e5331..26d4390 100644
--- a/src/jobspy/scrapers/linkedin/__init__.py
+++ b/src/jobspy/scrapers/linkedin/__init__.py
@@ -18,12 +18,7 @@ from threading import Lock
 from .. import Scraper, ScraperInput, Site
 from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type
 from ..exceptions import LinkedInException
-from ...jobs import (
-    JobPost,
-    Location,
-    JobResponse,
-    JobType,
-)
+from ...jobs import JobPost, Location, JobResponse, JobType, Country
 
 
 class LinkedInScraper(Scraper):
@@ -181,7 +176,6 @@ class LinkedInScraper(Scraper):
             location=location,
             date_posted=date_posted,
             job_url=job_url,
-            # job_type=[JobType.FULL_TIME],
             job_type=job_type,
             benefits=benefits,
             emails=extract_emails_from_text(description) if description else None,
@@ -246,7 +240,7 @@ class LinkedInScraper(Scraper):
         :param metadata_card
         :return: location
         """
-        location = Location(country=self.country)
+        location = Location(country=Country.from_string(self.country))
         if metadata_card is not None:
             location_tag = metadata_card.find(
                 "span", class_="job-search-card__location"
@@ -258,7 +252,7 @@
             location = Location(
                 city=city,
                 state=state,
-                country=self.country,
+                country=Country.from_string(self.country),
             )
 
         return location
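The renamed `indeed_domain_value` keeps Indeed's URL construction working off the same tuples. For instance, with values from the enum above:

```python
from jobspy.jobs import Country

for c in (Country.USA, Country.UK, Country.AUSTRALIA):
    print(f"https://{c.indeed_domain_value}.indeed.com")
# https://www.indeed.com
# https://uk.indeed.com
# https://au.indeed.com
```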
diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/src/jobspy/scrapers/ziprecruiter/__init__.py
index b999746..7559ade 100644
--- a/src/jobspy/scrapers/ziprecruiter/__init__.py
+++ b/src/jobspy/scrapers/ziprecruiter/__init__.py
@@ -16,13 +16,13 @@ from concurrent.futures import ThreadPoolExecutor
 from .. import Scraper, ScraperInput, Site
 from ..exceptions import ZipRecruiterException
 from ..utils import count_urgent_words, extract_emails_from_text, create_session
-from ...jobs import JobPost, Compensation, Location, JobResponse, JobType
+from ...jobs import JobPost, Compensation, Location, JobResponse, JobType, Country
 
 
 class ZipRecruiterScraper(Scraper):
     def __init__(self, proxy: Optional[str] = None):
         """
-        Initializes LinkedInScraper with the ZipRecruiter job search url
+        Initializes ZipRecruiterScraper with the ZipRecruiter job search url
         """
         site = Site(Site.ZIP_RECRUITER)
         self.url = "https://www.ziprecruiter.com"
@@ -31,7 +31,9 @@ class ZipRecruiterScraper(Scraper):
         self.jobs_per_page = 20
         self.seen_urls = set()
 
-    def find_jobs_in_page(self, scraper_input: ScraperInput, continue_token: str | None = None) -> Tuple[list[JobPost], Optional[str]]:
+    def find_jobs_in_page(
+        self, scraper_input: ScraperInput, continue_token: str | None = None
+    ) -> Tuple[list[JobPost], Optional[str]]:
         """
         Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
         :param scraper_input:
@@ -40,7 +42,7 @@
         """
         params = self.add_params(scraper_input)
         if continue_token:
-            params['continue'] = continue_token
+            params["continue"] = continue_token
         try:
             session = create_session(self.proxy, is_tls=False)
             response = session.get(
@@ -61,13 +63,10 @@
             time.sleep(5)
         response_data = response.json()
         jobs_list = response_data.get("jobs", [])
-        next_continue_token = response_data.get('continue', None)
+        next_continue_token = response_data.get("continue", None)
 
         with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
-            job_results = [
-                executor.submit(self.process_job, job)
-                for job in jobs_list
-            ]
+            job_results = [executor.submit(self.process_job, job) for job in jobs_list]
 
         job_list = [result.result() for result in job_results if result.result()]
         return job_list, next_continue_token
@@ -87,7 +86,9 @@ class ZipRecruiterScraper(Scraper):
             if len(job_list) >= scraper_input.results_wanted:
                 break
 
-            jobs_on_page, continue_token = self.find_jobs_in_page(scraper_input, continue_token)
+            jobs_on_page, continue_token = self.find_jobs_in_page(
+                scraper_input, continue_token
+            )
             if jobs_on_page:
                 job_list.extend(jobs_on_page)
@@ -95,13 +96,13 @@
                 break
 
         if len(job_list) > scraper_input.results_wanted:
-            job_list = job_list[:scraper_input.results_wanted]
+            job_list = job_list[: scraper_input.results_wanted]
 
         return JobResponse(jobs=job_list)
 
     @staticmethod
     def process_job(job: dict) -> JobPost:
-        """ Processes an individual job dict from the response """
+        """Processes an individual job dict from the response"""
         title = job.get("name")
         job_url = job.get("job_url")
@@ -109,9 +110,12 @@
         description = BeautifulSoup(
             job.get("job_description", "").strip(), "html.parser"
         ).get_text()
 
-        company = job['hiring_company'].get("name") if "hiring_company" in job else None
+        company = job["hiring_company"].get("name") if "hiring_company" in job else None
+
+        country_value = "usa" if job.get("job_country") == "US" else "canada"
+        country_enum = Country.from_string(country_value)
+
         location = Location(
-            city=job.get("job_city"), state=job.get("job_state"), country='usa' if job.get("job_country") == 'US' else 'canada'
+            city=job.get("job_city"), state=job.get("job_state"), country=country_enum
         )
         job_type = ZipRecruiterScraper.get_job_type_enum(
             job.get("employment_type", "").replace("_", "").lower()
         )
@@ -134,9 +138,15 @@ class ZipRecruiterScraper(Scraper):
             location=location,
             job_type=job_type,
             compensation=Compensation(
-                interval="yearly" if job.get("compensation_interval") == "annual" else job.get("compensation_interval"),
-                min_amount=int(job["compensation_min"]) if "compensation_min" in job else None,
-                max_amount=int(job["compensation_max"]) if "compensation_max" in job else None,
+                interval="yearly"
+                if job.get("compensation_interval") == "annual"
+                else job.get("compensation_interval"),
+                min_amount=int(job["compensation_min"])
+                if "compensation_min" in job
+                else None,
+                max_amount=int(job["compensation_max"])
+                if "compensation_max" in job
+                else None,
                 currency=job.get("compensation_currency"),
             ),
             date_posted=date_posted,
@@ -189,13 +199,13 @@ class ZipRecruiterScraper(Scraper):
         :return: dict - Dictionary containing headers
         """
         return {
-            'Host': 'api.ziprecruiter.com',
-            'Cookie': 'ziprecruiter_browser=018188e0-045b-4ad7-aa50-627a6c3d43aa; ziprecruiter_session=5259b2219bf95b6d2299a1417424bc2edc9f4b38; SplitSV=2016-10-19%3AU2FsdGVkX19f9%2Bx70knxc%2FeR3xXR8lWoTcYfq5QjmLU%3D%0A; __cf_bm=qXim3DtLPbOL83GIp.ddQEOFVFTc1OBGPckiHYxcz3o-1698521532-0-AfUOCkgCZyVbiW1ziUwyefCfzNrJJTTKPYnif1FZGQkT60dMowmSU/Y/lP+WiygkFPW/KbYJmyc+MQSkkad5YygYaARflaRj51abnD+SyF9V; zglobalid=68d49bd5-0326-428e-aba8-8a04b64bc67c.af2d99ff7c03.653d61bb; ziprecruiter_browser=018188e0-045b-4ad7-aa50-627a6c3d43aa; ziprecruiter_session=5259b2219bf95b6d2299a1417424bc2edc9f4b38',
-            'accept': '*/*',
-            'x-zr-zva-override': '100000000;vid:ZT1huzm_EQlDTVEc',
-            'x-pushnotificationid': '0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0',
-            'x-deviceid': 'D77B3A92-E589-46A4-8A39-6EF6F1D86006',
-            'user-agent': 'Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)',
-            'authorization': 'Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==',
-            'accept-language': 'en-US,en;q=0.9'
+            "Host": "api.ziprecruiter.com",
+            "Cookie": "ziprecruiter_browser=018188e0-045b-4ad7-aa50-627a6c3d43aa; ziprecruiter_session=5259b2219bf95b6d2299a1417424bc2edc9f4b38; SplitSV=2016-10-19%3AU2FsdGVkX19f9%2Bx70knxc%2FeR3xXR8lWoTcYfq5QjmLU%3D%0A; __cf_bm=qXim3DtLPbOL83GIp.ddQEOFVFTc1OBGPckiHYxcz3o-1698521532-0-AfUOCkgCZyVbiW1ziUwyefCfzNrJJTTKPYnif1FZGQkT60dMowmSU/Y/lP+WiygkFPW/KbYJmyc+MQSkkad5YygYaARflaRj51abnD+SyF9V; zglobalid=68d49bd5-0326-428e-aba8-8a04b64bc67c.af2d99ff7c03.653d61bb; ziprecruiter_browser=018188e0-045b-4ad7-aa50-627a6c3d43aa; ziprecruiter_session=5259b2219bf95b6d2299a1417424bc2edc9f4b38",
+            "accept": "*/*",
+            "x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc",
+            "x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0",
+            "x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006",
+            "user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)",
+            "authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==",
+            "accept-language": "en-US,en;q=0.9",
         }
diff --git a/src/tests/test_all.py b/src/tests/test_all.py
index 5ffd333..c34524e 100644
--- a/src/tests/test_all.py
+++ b/src/tests/test_all.py
@@ -4,7 +4,7 @@ import pandas as pd
 
 def test_all():
     result = scrape_jobs(
-        site_name=["linkedin", "indeed", "zip_recruiter"],
+        site_name=["linkedin", "indeed", "zip_recruiter", "glassdoor"],
         search_term="software engineer",
         results_wanted=5,
     )
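The reflowed `Compensation` block is mostly black formatting, but the interval normalization it wraps is easy to miss: ZipRecruiter reports `"annual"`, which the model stores as `"yearly"`. A standalone sketch of that mapping with a fabricated payload fragment:

```python
# Fabricated ZipRecruiter job fragment; keys match those read in process_job.
job = {
    "compensation_interval": "annual",
    "compensation_min": 90000.0,
    "compensation_max": 120000.0,
    "compensation_currency": "USD",
}

interval = (
    "yearly"
    if job.get("compensation_interval") == "annual"
    else job.get("compensation_interval")
)
min_amount = int(job["compensation_min"]) if "compensation_min" in job else None
max_amount = int(job["compensation_max"]) if "compensation_max" in job else None
print(interval, min_amount, max_amount)  # yearly 90000 120000
```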
diff --git a/src/tests/test_glassdoor.py b/src/tests/test_glassdoor.py
new file mode 100644
index 0000000..6ef36ae
--- /dev/null
+++ b/src/tests/test_glassdoor.py
@@ -0,0 +1,11 @@
+from ..jobspy import scrape_jobs
+import pandas as pd
+
+
+def test_glassdoor():
+    result = scrape_jobs(
+        site_name="glassdoor", search_term="software engineer", country_indeed="USA"
+    )
+    assert (
+        isinstance(result, pd.DataFrame) and not result.empty
+    ), "Result should be a non-empty DataFrame"
diff --git a/src/tests/test_indeed.py b/src/tests/test_indeed.py
index 280215f..2eef36b 100644
--- a/src/tests/test_indeed.py
+++ b/src/tests/test_indeed.py
@@ -4,8 +4,7 @@ import pandas as pd
 
 def test_indeed():
     result = scrape_jobs(
-        site_name="indeed",
-        search_term="software engineer",
+        site_name="indeed", search_term="software engineer", country_indeed="usa"
     )
     assert (
         isinstance(result, pd.DataFrame) and not result.empty
     ), "Result should be a non-empty DataFrame"
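To run the new coverage locally, a standard pytest invocation works; note these tests hit live endpoints, so they can fail for network reasons:

```python
import pytest

# Run just the Glassdoor test file; add "-s" to surface scraper print() output.
pytest.main(["src/tests/test_glassdoor.py", "-s"])
```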