mirror of https://github.com/Bunsly/JobSpy
feat: Ability to query by time posted for linkedin, indeed, glassdoor, ziprecruiter (#103)
parent 2563c5ca08
commit 91b137ef86
@@ -29,18 +29,20 @@ _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/)
 ### Usage
 
 ```python
+import csv
 from jobspy import scrape_jobs
 
 jobs = scrape_jobs(
     site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor"],
     search_term="software engineer",
     location="Dallas, TX",
-    results_wanted=10,
+    results_wanted=20,
+    hours_old=72, # (only linkedin is hour specific, others round up to days old)
     country_indeed='USA' # only needed for indeed / glassdoor
 )
 print(f"Found {len(jobs)} jobs")
 print(jobs.head())
-jobs.to_csv("jobs.csv", index=False) # to_xlsx
+jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False) # to_xlsx
 ```
 
 ### Output
@@ -73,6 +75,7 @@ Optional
 ├── linkedin_company_ids (list[int]): searches for linkedin jobs with specific company ids
 ├── country_indeed (enum): filters the country on Indeed (see below for correct spelling)
 ├── offset (num): starts the search from an offset (e.g. 25 will start the search from the 25th result)
+├── hours_old (int): filters jobs by the number of hours since the job was posted (all but LinkedIn round up to the next day)
 ```
 
 ### JobPost Schema
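Between the README change above and the scraper hunks below, the net user-facing addition is the `hours_old` parameter. A minimal usage sketch, assuming the same public API shown in the README (the site list, search term, and filename here are illustrative):

```python
import csv

from jobspy import scrape_jobs

# Only jobs posted within the last 24 hours. LinkedIn filters at hour
# granularity; Indeed, Glassdoor, and ZipRecruiter coarsen this to whole days.
jobs = scrape_jobs(
    site_name=["indeed", "linkedin"],
    search_term="data engineer",
    location="Austin, TX",
    results_wanted=20,
    hours_old=24,
    country_indeed="USA",
)
jobs.to_csv("fresh_jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False)
```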
@@ -42,6 +42,8 @@ def scrape_jobs(
     full_description: bool | None = False,
     linkedin_company_ids: list[int] | None = None,
     offset: int | None = 0,
+    hours_old: int = None,
+    **kwargs,
 ) -> pd.DataFrame:
     """
     Simultaneously scrapes job data from multiple job sites.
@@ -84,6 +86,7 @@ def scrape_jobs(
         results_wanted=results_wanted,
         linkedin_company_ids=linkedin_company_ids,
         offset=offset,
+        hours_old=hours_old
     )
 
     def scrape_site(site: Site) -> Tuple[str, JobResponse]:
@@ -189,4 +192,4 @@ def scrape_jobs(
     else:
         jobs_formatted_df = pd.DataFrame()
 
-    return jobs_formatted_df
+    return jobs_formatted_df.sort_values(by='date_posted', ascending=False)
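The new return statement means `scrape_jobs` now hands back results ordered newest-first. A standalone pandas sketch with toy data (not jobspy output) showing the ordering, and that rows with a missing `date_posted` sort to the end:

```python
import pandas as pd

# Toy frame standing in for jobs_formatted_df; date_posted can be missing
# when a site reports no posting age (see the Glassdoor ageInDays handling).
df = pd.DataFrame({
    "title": ["a", "b", "c"],
    "date_posted": pd.to_datetime(["2024-01-02", None, "2024-01-05"]),
})

# Same call as the new return statement: newest postings first.
out = df.sort_values(by="date_posted", ascending=False)
print(out["title"].tolist())  # ['c', 'a', 'b'] -- NaT sorts last by default
```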
@@ -23,6 +23,7 @@ class ScraperInput(BaseModel):
     linkedin_company_ids: list[int] | None = None
 
     results_wanted: int = 15
+    hours_old: int | None = None
 
 
 class Scraper:
@@ -30,5 +31,4 @@ class Scraper:
         self.site = site
         self.proxy = (lambda p: {"http": p, "https": p} if p else None)(proxy)
 
-    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
-        ...
+    def scrape(self, scraper_input: ScraperInput) -> JobResponse: ...
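`hours_old` lands on `ScraperInput` as a single shared field; each scraper below converts it to its site's native unit. A sketch of the two conversions that recur in the following hunks (the helper names are mine, for illustration only):

```python
def hours_to_days(hours_old: int | None) -> int | None:
    # Indeed / Glassdoor / ZipRecruiter: whole days, floored, clamped to >= 1,
    # so a sub-day value like 5 hours still queries "past 1 day".
    return max(hours_old // 24, 1) if hours_old else None

def hours_to_seconds(hours_old: int | None) -> int | None:
    # LinkedIn: the f_TPR param takes "r<seconds>", keeping hour granularity.
    return hours_old * 3600 if hours_old else None

assert hours_to_days(72) == 3
assert hours_to_days(5) == 1            # sub-day values clamp to one day
assert hours_to_seconds(72) == 259200   # -> f_TPR=r259200
assert hours_to_days(None) is None      # no filter requested
```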
@@ -100,7 +100,7 @@ class GlassdoorScraper(Scraper):
         location_type = job["header"].get("locationType", "")
         age_in_days = job["header"].get("ageInDays")
         is_remote, location = False, None
-        date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days else None
+        date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days is not None else None
 
         if location_type == "S":
             is_remote = True
@@ -258,11 +258,19 @@ class GlassdoorScraper(Scraper):
         page_num: int,
         cursor: str | None = None,
     ) -> str:
+        # `fromage` is the posting time filter in days
+        fromage = max(scraper_input.hours_old // 24, 1) if scraper_input.hours_old else None
+        filter_params = []
+        if scraper_input.easy_apply:
+            filter_params.append({"filterKey": "applicationType", "values": "1"})
+        if fromage:
+            filter_params.append({"filterKey": "fromAge", "values": str(fromage)})
         payload = {
             "operationName": "JobSearchResultsQuery",
             "variables": {
                 "excludeJobListingIds": [],
-                "filterParams": [{"filterKey": "applicationType", "values": "1"}] if scraper_input.easy_apply else [],
+                "filterParams": filter_params,
                 "keyword": scraper_input.search_term,
                 "numJobsToShow": 30,
                 "locationType": location_type,
@@ -270,6 +278,8 @@ class GlassdoorScraper(Scraper):
                 "parameterUrlInput": f"IL.0,12_I{location_type}{location_id}",
                 "pageNumber": page_num,
                 "pageCursor": cursor,
+                "fromage": fromage,
+                "sort": "date"
             },
             "query": "query JobSearchResultsQuery($excludeJobListingIds: [Long!], $keyword: String, $locationId: Int, $locationType: LocationTypeEnum, $numJobsToShow: Int!, $pageCursor: String, $pageNumber: Int, $filterParams: [FilterParams], $originalPageUrl: String, $seoFriendlyUrlInput: String, $parameterUrlInput: String, $seoUrl: Boolean) {\n  jobListings(\n    contextHolder: {searchParams: {excludeJobListingIds: $excludeJobListingIds, keyword: $keyword, locationId: $locationId, locationType: $locationType, numPerPage: $numJobsToShow, pageCursor: $pageCursor, pageNumber: $pageNumber, filterParams: $filterParams, originalPageUrl: $originalPageUrl, seoFriendlyUrlInput: $seoFriendlyUrlInput, parameterUrlInput: $parameterUrlInput, seoUrl: $seoUrl, searchType: SR}}\n  ) {\n    companyFilterOptions {\n      id\n      shortName\n      __typename\n    }\n    filterOptions\n    indeedCtk\n    jobListings {\n      ...JobView\n      __typename\n    }\n    jobListingSeoLinks {\n      linkItems {\n        position\n        url\n        __typename\n      }\n      __typename\n    }\n    jobSearchTrackingKey\n    jobsPageSeoData {\n      pageMetaDescription\n      pageTitle\n      __typename\n    }\n    paginationCursors {\n      cursor\n      pageNumber\n      __typename\n    }\n    indexablePageForSeo\n    searchResultsMetadata {\n      searchCriteria {\n        implicitLocation {\n          id\n          localizedDisplayName\n          type\n          __typename\n        }\n        keyword\n        location {\n          id\n          shortName\n          localizedShortName\n          localizedDisplayName\n          type\n          __typename\n        }\n        __typename\n      }\n      footerVO {\n        countryMenu {\n          childNavigationLinks {\n            id\n            link\n            textKey\n            __typename\n          }\n          __typename\n        }\n        __typename\n      }\n      helpCenterDomain\n      helpCenterLocale\n      jobAlert {\n        jobAlertExists\n        __typename\n      }\n      jobSerpFaq {\n        questions {\n          answer\n          question\n          __typename\n        }\n        __typename\n      }\n      jobSerpJobOutlook {\n        occupation\n        paragraph\n        __typename\n      }\n      showMachineReadableJobs\n      __typename\n    }\n    serpSeoLinksVO {\n      relatedJobTitlesResults\n      searchedJobTitle\n      searchedKeyword\n      searchedLocationIdAsString\n      searchedLocationSeoName\n      searchedLocationType\n      topCityIdsToNameResults {\n        key\n        value\n        __typename\n      }\n      topEmployerIdsToNameResults {\n        key\n        value\n        __typename\n      }\n      topEmployerNameResults\n      topOccupationResults\n      __typename\n    }\n    totalJobsCount\n    __typename\n  }\n}\n\nfragment JobView on JobListingSearchResult {\n  jobview {\n    header {\n      adOrderId\n      advertiserType\n      adOrderSponsorshipLevel\n      ageInDays\n      divisionEmployerName\n      easyApply\n      employer {\n        id\n        name\n        shortName\n        __typename\n      }\n      employerNameFromSearch\n      goc\n      gocConfidence\n      gocId\n      jobCountryId\n      jobLink\n      jobResultTrackingKey\n      jobTitleText\n      locationName\n      locationType\n      locId\n      needsCommission\n      payCurrency\n      payPeriod\n      payPeriodAdjustedPay {\n        p10\n        p50\n        p90\n        __typename\n      }\n      rating\n      salarySource\n      savedJobId\n      sponsored\n      __typename\n    }\n    job {\n      descriptionFragments\n      importConfigId\n      jobTitleId\n      jobTitleText\n      listingId\n      __typename\n    }\n    jobListingAdminDetails {\n      cpcVal\n      importConfigId\n      jobListingId\n      jobSourceId\n      userEligibleForAdminJobDetails\n      __typename\n    }\n    overview {\n      shortName\n      squareLogoUrl\n      __typename\n    }\n    __typename\n  }\n  __typename\n}\n",
         }
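For concreteness, here is the `filter_params` construction from the Glassdoor hunk above run standalone; the dict shapes match the diff, the input values are made up:

```python
# Standalone reconstruction of the filter_params logic above.
easy_apply = True
hours_old = 72

fromage = max(hours_old // 24, 1) if hours_old else None

filter_params = []
if easy_apply:
    filter_params.append({"filterKey": "applicationType", "values": "1"})
if fromage:
    filter_params.append({"filterKey": "fromAge", "values": str(fromage)})

print(filter_params)
# [{'filterKey': 'applicationType', 'values': '1'},
#  {'filterKey': 'fromAge', 'values': '3'}]
```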
@@ -363,12 +363,15 @@ class IndeedScraper(Scraper):
 
     @staticmethod
     def add_params(scraper_input: ScraperInput, page: int) -> dict[str, str | Any]:
+        # `fromage` is the posting time filter in days
+        fromage = max(scraper_input.hours_old // 24, 1) if scraper_input.hours_old else None
         params = {
             "q": scraper_input.search_term,
             "l": scraper_input.location if scraper_input.location else scraper_input.country.value[0].split(',')[-1],
             "filter": 0,
             "start": scraper_input.offset + page * 10,
-            "sort": "date"
+            "sort": "date",
+            "fromage": fromage,
         }
         if scraper_input.distance:
             params["radius"] = scraper_input.distance
@@ -405,8 +408,7 @@ class IndeedScraper(Scraper):
         )
         return is_remote_in_attributes or is_remote_in_description or is_remote_in_location
 
-    @staticmethod
-    def get_job_details(job_keys: list[str]) -> dict:
+    def get_job_details(self, job_keys: list[str]) -> dict:
         """
         Queries the GraphQL endpoint for detailed job information for the given job keys.
         """
@@ -478,7 +480,7 @@ class IndeedScraper(Scraper):
                 }}
             """
         }
-        response = requests.post(url, headers=headers, json=payload)
+        response = requests.post(url, headers=headers, json=payload, proxies=self.proxy)
         if response.status_code == 200:
             return response.json()['data']['jobData']['results']
         else:
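A detail worth noting in the Indeed hunk above: when `hours_old` is unset, `fromage` is `None`, and `requests` omits `None`-valued entries when it encodes a params dict, so the query string is unchanged from the pre-feature behavior. A small sketch (the httpbin URL is just for demonstration):

```python
import requests

# Same shape as add_params output when hours_old is unset.
params = {"q": "software engineer", "sort": "date", "fromage": None}

# requests skips None-valued params while encoding, so no fromage is sent.
response = requests.get("https://httpbin.org/get", params=params)
print(response.url)  # https://httpbin.org/get?q=software+engineer&sort=date
```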
@@ -59,6 +59,12 @@ class LinkedInScraper(Scraper):
         url_lock = Lock()
         page = scraper_input.offset // 25 + 25 if scraper_input.offset else 0
 
+        seconds_old = (
+            scraper_input.hours_old * 3600
+            if scraper_input.hours_old
+            else None
+        )
+
         def job_type_code(job_type_enum):
             mapping = {
                 JobType.FULL_TIME: "F",
@@ -85,7 +91,8 @@ class LinkedInScraper(Scraper):
             "pageNum": 0,
             "start": page + scraper_input.offset,
             "f_AL": "true" if scraper_input.easy_apply else None,
-            "f_C": ','.join(map(str, scraper_input.linkedin_company_ids)) if scraper_input.linkedin_company_ids else None
+            "f_C": ','.join(map(str, scraper_input.linkedin_company_ids)) if scraper_input.linkedin_company_ids else None,
+            "f_TPR": f"r{seconds_old}",
         }
 
         params = {k: v for k, v in params.items() if v is not None}
@@ -101,7 +108,9 @@ class LinkedInScraper(Scraper):
             response.raise_for_status()
 
         except requests.HTTPError as e:
-            raise LinkedInException(f"bad response status code: {e.response.status_code}")
+            raise LinkedInException(
+                f"bad response status code: {e.response.status_code}"
+            )
         except ProxyError as e:
             raise LinkedInException("bad proxy")
         except Exception as e:
@@ -145,11 +154,11 @@ class LinkedInScraper(Scraper):
 
         compensation = None
         if salary_tag:
-            salary_text = salary_tag.get_text(separator=' ').strip()
-            salary_values = [currency_parser(value) for value in salary_text.split('-')]
+            salary_text = salary_tag.get_text(separator=" ").strip()
+            salary_values = [currency_parser(value) for value in salary_text.split("-")]
             salary_min = salary_values[0]
             salary_max = salary_values[1]
-            currency = salary_text[0] if salary_text[0] != '$' else 'USD'
+            currency = salary_text[0] if salary_text[0] != "$" else "USD"
 
             compensation = Compensation(
                 min_amount=int(salary_min),
@@ -294,17 +303,17 @@ class LinkedInScraper(Scraper):
     @staticmethod
     def headers() -> dict:
         return {
-            'authority': 'www.linkedin.com',
-            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
-            'accept-language': 'en-US,en;q=0.9',
-            'cache-control': 'max-age=0',
-            'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+            "authority": "www.linkedin.com",
+            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+            "accept-language": "en-US,en;q=0.9",
+            "cache-control": "max-age=0",
+            "sec-ch-ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
             # 'sec-ch-ua-mobile': '?0',
             # 'sec-ch-ua-platform': '"macOS"',
             # 'sec-fetch-dest': 'document',
             # 'sec-fetch-mode': 'navigate',
             # 'sec-fetch-site': 'none',
             # 'sec-fetch-user': '?1',
-            'upgrade-insecure-requests': '1',
-            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+            "upgrade-insecure-requests": "1",
+            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
         }
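One edge case in the LinkedIn hunk above: `params` is filtered for `None` values afterwards, but f-string interpolation of `None` produces the string `"rNone"`, which is not `None` and survives that filter when `hours_old` is unset. A defensive variant, offered as a suggestion rather than as what the diff does:

```python
# Hypothetical guard; the diff above always sets "f_TPR": f"r{seconds_old}".
seconds_old = None  # i.e., hours_old was not provided

params = {
    "f_TPR": f"r{seconds_old}" if seconds_old is not None else None,
}
params = {k: v for k, v in params.items() if v is not None}
print(params)  # {} -- no time filter sent, instead of f_TPR=rNone
```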
@@ -165,6 +165,9 @@ class ZipRecruiterScraper(Scraper):
             "search": scraper_input.search_term,
             "location": scraper_input.location,
         }
+        if scraper_input.hours_old:
+            fromage = max(scraper_input.hours_old // 24, 1) if scraper_input.hours_old else None
+            params['days'] = fromage
         job_type_value = None
         if scraper_input.job_type:
             if scraper_input.job_type.value == "fulltime":
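A small nit on the ZipRecruiter hunk above: inside `if scraper_input.hours_old:` the trailing `if ... else None` can never take its `else` branch. An equivalent two-line sketch (illustrative values, same behavior):

```python
hours_old = 48  # illustrative; comes from ScraperInput.hours_old
params = {"search": "software engineer", "location": "Dallas, TX"}

# Equivalent to the addition above: the outer truthiness check already
# guarantees hours_old is set, so the inner `... else None` is unreachable.
if hours_old:
    params["days"] = max(hours_old // 24, 1)

print(params["days"])  # 2
```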