From e8db57695ca8b71ebe8ed524ec12efbbcf61778a Mon Sep 17 00:00:00 2001 From: VitaminB16 Date: Fri, 9 Feb 2024 15:04:12 +0000 Subject: [PATCH] feat: Ability to query by time posted for linkedin, indeed, glassdoor --- src/jobspy/__init__.py | 3 +++ src/jobspy/scrapers/__init__.py | 4 +-- src/jobspy/scrapers/glassdoor/__init__.py | 3 +++ src/jobspy/scrapers/indeed/__init__.py | 5 +++- src/jobspy/scrapers/linkedin/__init__.py | 33 ++++++++++++++--------- 5 files changed, 33 insertions(+), 15 deletions(-) diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py index cf0222b..0c63b54 100644 --- a/src/jobspy/__init__.py +++ b/src/jobspy/__init__.py @@ -43,6 +43,8 @@ def scrape_jobs( full_description: bool | None = False, linkedin_company_ids: list[int] | None = None, offset: int | None = 0, + hours_old: int | None = None, + **kwargs, ) -> pd.DataFrame: """ Simultaneously scrapes job data from multiple job sites. @@ -85,6 +87,7 @@ def scrape_jobs( results_wanted=results_wanted, linkedin_company_ids=linkedin_company_ids, offset=offset, + hours_old=hours_old ) def scrape_site(site: Site) -> Tuple[str, JobResponse]: diff --git a/src/jobspy/scrapers/__init__.py b/src/jobspy/scrapers/__init__.py index fc8c633..f180d0d 100644 --- a/src/jobspy/scrapers/__init__.py +++ b/src/jobspy/scrapers/__init__.py @@ -23,6 +23,7 @@ class ScraperInput(BaseModel): linkedin_company_ids: list[int] | None = None results_wanted: int = 15 + hours_old: int | None = None class Scraper: @@ -30,5 +31,4 @@ class Scraper: self.site = site self.proxy = (lambda p: {"http": p, "https": p} if p else None)(proxy) - def scrape(self, scraper_input: ScraperInput) -> JobResponse: - ... + def scrape(self, scraper_input: ScraperInput) -> JobResponse: ... 
diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py index 893357c..cf539e0 100644 --- a/src/jobspy/scrapers/glassdoor/__init__.py +++ b/src/jobspy/scrapers/glassdoor/__init__.py @@ -258,6 +258,8 @@ class GlassdoorScraper(Scraper): page_num: int, cursor: str | None = None, ) -> str: + # `fromage` is the posting time filter in days (round up to at least 1 day) + fromage = max(scraper_input.hours_old // 24, 1) if scraper_input.hours_old else None payload = { "operationName": "JobSearchResultsQuery", "variables": { @@ -270,6 +272,7 @@ class GlassdoorScraper(Scraper): "parameterUrlInput": f"IL.0,12_I{location_type}{location_id}", "pageNumber": page_num, "pageCursor": cursor, + "fromAge": fromage }, "query": "query JobSearchResultsQuery($excludeJobListingIds: [Long!], $keyword: String, $locationId: Int, $locationType: LocationTypeEnum, $numJobsToShow: Int!, $pageCursor: String, $pageNumber: Int, $filterParams: [FilterParams], $originalPageUrl: String, $seoFriendlyUrlInput: String, $parameterUrlInput: String, $seoUrl: Boolean) {\n jobListings(\n contextHolder: {searchParams: {excludeJobListingIds: $excludeJobListingIds, keyword: $keyword, locationId: $locationId, locationType: $locationType, numPerPage: $numJobsToShow, pageCursor: $pageCursor, pageNumber: $pageNumber, filterParams: $filterParams, originalPageUrl: $originalPageUrl, seoFriendlyUrlInput: $seoFriendlyUrlInput, parameterUrlInput: $parameterUrlInput, seoUrl: $seoUrl, searchType: SR}}\n ) {\n companyFilterOptions {\n id\n shortName\n __typename\n }\n filterOptions\n indeedCtk\n jobListings {\n ...JobView\n __typename\n }\n jobListingSeoLinks {\n linkItems {\n position\n url\n __typename\n }\n __typename\n }\n jobSearchTrackingKey\n jobsPageSeoData {\n pageMetaDescription\n pageTitle\n __typename\n }\n paginationCursors {\n cursor\n pageNumber\n __typename\n }\n indexablePageForSeo\n searchResultsMetadata {\n searchCriteria {\n implicitLocation {\n id\n localizedDisplayName\n type\n __typename\n 
}\n keyword\n location {\n id\n shortName\n localizedShortName\n localizedDisplayName\n type\n __typename\n }\n __typename\n }\n footerVO {\n countryMenu {\n childNavigationLinks {\n id\n link\n textKey\n __typename\n }\n __typename\n }\n __typename\n }\n helpCenterDomain\n helpCenterLocale\n jobAlert {\n jobAlertExists\n __typename\n }\n jobSerpFaq {\n questions {\n answer\n question\n __typename\n }\n __typename\n }\n jobSerpJobOutlook {\n occupation\n paragraph\n __typename\n }\n showMachineReadableJobs\n __typename\n }\n serpSeoLinksVO {\n relatedJobTitlesResults\n searchedJobTitle\n searchedKeyword\n searchedLocationIdAsString\n searchedLocationSeoName\n searchedLocationType\n topCityIdsToNameResults {\n key\n value\n __typename\n }\n topEmployerIdsToNameResults {\n key\n value\n __typename\n }\n topEmployerNameResults\n topOccupationResults\n __typename\n }\n totalJobsCount\n __typename\n }\n}\n\nfragment JobView on JobListingSearchResult {\n jobview {\n header {\n adOrderId\n advertiserType\n adOrderSponsorshipLevel\n ageInDays\n divisionEmployerName\n easyApply\n employer {\n id\n name\n shortName\n __typename\n }\n employerNameFromSearch\n goc\n gocConfidence\n gocId\n jobCountryId\n jobLink\n jobResultTrackingKey\n jobTitleText\n locationName\n locationType\n locId\n needsCommission\n payCurrency\n payPeriod\n payPeriodAdjustedPay {\n p10\n p50\n p90\n __typename\n }\n rating\n salarySource\n savedJobId\n sponsored\n __typename\n }\n job {\n descriptionFragments\n importConfigId\n jobTitleId\n jobTitleText\n listingId\n __typename\n }\n jobListingAdminDetails {\n cpcVal\n importConfigId\n jobListingId\n jobSourceId\n userEligibleForAdminJobDetails\n __typename\n }\n overview {\n shortName\n squareLogoUrl\n __typename\n }\n __typename\n }\n __typename\n}\n", } diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py index 9e21e70..02fc39a 100644 --- a/src/jobspy/scrapers/indeed/__init__.py +++ 
b/src/jobspy/scrapers/indeed/__init__.py @@ -346,12 +346,15 @@ class IndeedScraper(Scraper): @staticmethod def add_params(scraper_input: ScraperInput, page: int) -> dict[str, str | Any]: + # `fromage` is the posting time filter in days (round up to at least 1 day) + fromage = max(scraper_input.hours_old // 24, 1) if scraper_input.hours_old else None params = { "q": scraper_input.search_term, "l": scraper_input.location if scraper_input.location else scraper_input.country.value[0].split(',')[-1], "filter": 0, "start": scraper_input.offset + page * 10, - "sort": "date" + "sort": "date", + "fromage": fromage, } if scraper_input.distance: params["radius"] = scraper_input.distance diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py index 19b6173..af52a06 100644 --- a/src/jobspy/scrapers/linkedin/__init__.py +++ b/src/jobspy/scrapers/linkedin/__init__.py @@ -59,6 +59,12 @@ class LinkedInScraper(Scraper): url_lock = Lock() page = scraper_input.offset // 25 + 25 if scraper_input.offset else 0 + seconds_old = ( + scraper_input.hours_old * 3600 + if scraper_input.hours_old + else None + ) + def job_type_code(job_type_enum): mapping = { JobType.FULL_TIME: "F", @@ -85,7 +91,8 @@ class LinkedInScraper(Scraper): "pageNum": 0, "start": page + scraper_input.offset, "f_AL": "true" if scraper_input.easy_apply else None, - "f_C": ','.join(map(str, scraper_input.linkedin_company_ids)) if scraper_input.linkedin_company_ids else None + "f_C": ','.join(map(str, scraper_input.linkedin_company_ids)) if scraper_input.linkedin_company_ids else None, + "f_TPR": f"r{seconds_old}" if seconds_old else None, } params = {k: v for k, v in params.items() if v is not None} @@ -101,7 +108,9 @@ class LinkedInScraper(Scraper): response.raise_for_status() except requests.HTTPError as e: - raise LinkedInException(f"bad response status code: {e.response.status_code}") + raise LinkedInException( + f"bad response status code: {e.response.status_code}" + ) except ProxyError as e: raise LinkedInException("bad proxy") 
except Exception as e: @@ -145,11 +154,11 @@ class LinkedInScraper(Scraper): compensation = None if salary_tag: - salary_text = salary_tag.get_text(separator=' ').strip() - salary_values = [currency_parser(value) for value in salary_text.split('-')] + salary_text = salary_tag.get_text(separator=" ").strip() + salary_values = [currency_parser(value) for value in salary_text.split("-")] salary_min = salary_values[0] salary_max = salary_values[1] - currency = salary_text[0] if salary_text[0] != '$' else 'USD' + currency = salary_text[0] if salary_text[0] != "$" else "USD" compensation = Compensation( min_amount=int(salary_min), @@ -294,17 +303,17 @@ class LinkedInScraper(Scraper): @staticmethod def headers() -> dict: return { - 'authority': 'www.linkedin.com', - 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', - 'accept-language': 'en-US,en;q=0.9', - 'cache-control': 'max-age=0', - 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"', + "authority": "www.linkedin.com", + "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", + "accept-language": "en-US,en;q=0.9", + "cache-control": "max-age=0", + "sec-ch-ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"', # 'sec-ch-ua-mobile': '?0', # 'sec-ch-ua-platform': '"macOS"', # 'sec-fetch-dest': 'document', # 'sec-fetch-mode': 'navigate', # 'sec-fetch-site': 'none', # 'sec-fetch-user': '?1', - 'upgrade-insecure-requests': '1', - 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' + "upgrade-insecure-requests": "1", + "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", }