Compare commits

...

3 Commits

Author            SHA1        Message                                Date
Cullen Watson     5b3627b244  enh: full description param (#85)      2024-01-22 20:22:32 -06:00
Cullen Watson     2ec3b04777  fix(ziprecruiter): init cookies (#82)  2024-01-12 12:28:35 -06:00
Harish Vadaparty  89a5264391  add long scrape example (#81)          2024-01-12 12:24:00 -06:00
10 changed files with 213 additions and 62 deletions

View File

@@ -67,6 +67,7 @@ Optional
 ├── job_type (enum): fulltime, parttime, internship, contract
 ├── proxy (str): in format 'http://user:pass@host:port' or [https, socks]
 ├── is_remote (bool)
+├── full_description (bool): fetches full description for Indeed / LinkedIn (much slower)
 ├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type'
 ├── easy_apply (bool): filters for jobs that are hosted on LinkedIn
 ├── country_indeed (enum): filters the country on Indeed (see below for correct spelling)
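
A minimal sketch of the new flag in use, assuming only the parameters documented above:

    from jobspy import scrape_jobs

    # full_description=True makes Indeed / LinkedIn fetch each posting's full
    # description with an extra request per job, so expect a much slower run
    jobs = scrape_jobs(
        site_name=["indeed", "linkedin"],
        search_term="software engineer",
        location="Dallas, TX",
        results_wanted=10,
        country_indeed="USA",
        full_description=True,
    )
    print(jobs.head())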

View File

@@ -2,12 +2,11 @@ from jobspy import scrape_jobs
 import pandas as pd

 jobs: pd.DataFrame = scrape_jobs(
-    site_name=["indeed", "linkedin", "zip_recruiter"],
+    site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor"],
     search_term="software engineer",
     location="Dallas, TX",
-    results_wanted=50,  # be wary the higher it is, the more likely you'll get blocked (rotating proxy should work tho)
+    results_wanted=25,  # be wary the higher it is, the more likely you'll get blocked (rotating proxy can help tho)
     country_indeed="USA",
-    offset=25,  # start jobs from an offset (use if search failed and want to continue)
     # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
 )

View File

@@ -0,0 +1,77 @@
+from jobspy import scrape_jobs
+import pandas as pd
+import os
+import time
+
+# create a new csv filename if jobs.csv already exists
+csv_filename = "jobs.csv"
+counter = 1
+while os.path.exists(csv_filename):
+    csv_filename = f"jobs_{counter}.csv"
+    counter += 1
+
+# total results wanted and starting offset
+results_wanted = 1000
+offset = 0
+
+all_jobs = []
+
+# max retries per batch
+max_retries = 3
+
+# number of results fetched in each iteration
+results_in_each_iteration = 30
+
+while len(all_jobs) < results_wanted:
+    retry_count = 0
+    while retry_count < max_retries:
+        print(f"Scraping jobs from offset {offset} to {offset + results_in_each_iteration}")
+        try:
+            jobs = scrape_jobs(
+                site_name=["indeed"],
+                search_term="software engineer",
+                # New York, NY
+                # Dallas, TX
+                # Los Angeles, CA
+                location="Los Angeles, CA",
+                results_wanted=min(results_in_each_iteration, results_wanted - len(all_jobs)),
+                country_indeed="USA",
+                offset=offset,
+                # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
+            )
+
+            # add the scraped jobs to the list
+            all_jobs.extend(jobs.to_dict("records"))
+
+            # increment the offset for the next page of results
+            offset += results_in_each_iteration
+
+            print(f"Scraped {len(all_jobs)} jobs")
+
+            # pause between requests to avoid rate limiting (adjust as needed)
+            print(f"Sleeping {100 * (retry_count + 1)} seconds")
+            time.sleep(100 * (retry_count + 1))
+
+            break  # break out of the retry loop on success
+        except Exception as e:
+            print(f"Error: {e}")
+            retry_count += 1
+            print(f"Sleeping {100 * (retry_count + 1)} seconds before retry")
+            time.sleep(100 * (retry_count + 1))
+
+    # give up if the last batch exhausted all retries
+    if retry_count >= max_retries:
+        print("Max retries reached. Exiting.")
+        break
+
+# DataFrame from the collected job data
+jobs_df = pd.DataFrame(all_jobs)
+
+# formatting for console output
+pd.set_option("display.max_columns", None)
+pd.set_option("display.max_rows", None)
+pd.set_option("display.width", None)
+pd.set_option("display.max_colwidth", 50)
+print(jobs_df)
+
+jobs_df.to_csv(csv_filename, index=False)
+print(f"Outputted to {csv_filename}")

View File

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.33"
+version = "1.1.35"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"

View File

@@ -40,6 +40,7 @@ def scrape_jobs(
     country_indeed: str = "usa",
     hyperlinks: bool = False,
     proxy: Optional[str] = None,
+    full_description: Optional[bool] = False,
     offset: Optional[int] = 0,
 ) -> pd.DataFrame:
     """
@@ -74,6 +75,7 @@ def scrape_jobs(
         is_remote=is_remote,
         job_type=job_type,
         easy_apply=easy_apply,
+        full_description=full_description,
         results_wanted=results_wanted,
         offset=offset,
     )

View File

@@ -19,6 +19,7 @@ class ScraperInput(BaseModel):
     is_remote: bool = False
     job_type: Optional[JobType] = None
     easy_apply: bool = None  # linkedin
+    full_description: bool = False
     offset: int = 0
     results_wanted: int = 15
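
Since ScraperInput is a pydantic model, every existing caller that omits the new field gets False. A quick sketch of that defaulting behavior, trimmed to the fields shown above:

    from pydantic import BaseModel

    class ScraperInput(BaseModel):  # trimmed to the fields relevant here
        full_description: bool = False
        offset: int = 0
        results_wanted: int = 15

    print(ScraperInput())                       # full_description=False offset=0 results_wanted=15
    print(ScraperInput(full_description=True))  # callers opt in explicitly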

View File

@@ -5,8 +5,12 @@ jobspy.scrapers.glassdoor
 This module contains routines to scrape Glassdoor.
 """
 import json
-from typing import Optional, Any
+import requests
+from bs4 import BeautifulSoup
+from typing import Optional
 from datetime import datetime, timedelta
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from ..utils import count_urgent_words, extract_emails_from_text
 from .. import Scraper, ScraperInput, Site
 from ..exceptions import GlassdoorException
@@ -66,14 +70,29 @@ class GlassdoorScraper(Scraper):
         jobs_data = res_json["data"]["jobListings"]["jobListings"]

         jobs = []
-        for i, job in enumerate(jobs_data):
-            job_url = res_json["data"]["jobListings"]["jobListingSeoLinks"][
-                "linkItems"
-            ][i]["url"]
-            if job_url in self.seen_urls:
-                continue
-            self.seen_urls.add(job_url)
-            job = job["jobview"]
+        with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
+            future_to_job_data = {executor.submit(self.process_job, job): job for job in jobs_data}
+            for future in as_completed(future_to_job_data):
+                job_data = future_to_job_data[future]
+                try:
+                    job_post = future.result()
+                    if job_post:
+                        jobs.append(job_post)
+                except Exception as exc:
+                    raise GlassdoorException(f'Glassdoor generated an exception: {exc}')
+
+        return jobs, self.get_cursor_for_page(
+            res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
+        )
+
+    def process_job(self, job_data):
+        """Processes a single job and fetches its description."""
+        job_id = job_data["jobview"]["job"]["listingId"]
+        job_url = f'{self.url}/job-listing/?jl={job_id}'
+        if job_url in self.seen_urls:
+            return None
+        self.seen_urls.add(job_url)
+        job = job_data["jobview"]
         title = job["job"]["jobTitleText"]
         company_name = job["header"]["employerNameFromSearch"]
         location_name = job["header"].get("locationName", "")
@@ -89,20 +108,24 @@ class GlassdoorScraper(Scraper):
         compensation = self.parse_compensation(job["header"])

-        job = JobPost(
+        try:
+            description = self.fetch_job_description(job_id)
+        except Exception as e:
+            description = None
+
+        job_post = JobPost(
             title=title,
             company_name=company_name,
             date_posted=date_posted,
             job_url=job_url,
             location=location,
             compensation=compensation,
-            is_remote=is_remote
-        )
-        jobs.append(job)
-
-        return jobs, self.get_cursor_for_page(
-            res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
+            is_remote=is_remote,
+            description=description,
+            emails=extract_emails_from_text(description) if description else None,
+            num_urgent_words=count_urgent_words(description) if description else None,
         )
+        return job_post

     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
@@ -110,6 +133,7 @@ class GlassdoorScraper(Scraper):
         :param scraper_input: Information about job search criteria.
         :return: JobResponse containing a list of jobs.
         """
+        scraper_input.results_wanted = min(900, scraper_input.results_wanted)
         self.country = scraper_input.country
         self.url = self.country.get_url()
@@ -143,6 +167,43 @@ class GlassdoorScraper(Scraper):
         return JobResponse(jobs=all_jobs)

+    def fetch_job_description(self, job_id):
+        """Fetches the job description for a single job ID."""
+        url = f"{self.url}/graph"
+        body = [
+            {
+                "operationName": "JobDetailQuery",
+                "variables": {
+                    "jl": job_id,
+                    "queryString": "q",
+                    "pageTypeEnum": "SERP"
+                },
+                "query": """
+                query JobDetailQuery($jl: Long!, $queryString: String, $pageTypeEnum: PageTypeEnum) {
+                    jobview: jobView(
+                        listingId: $jl
+                        contextHolder: {queryString: $queryString, pageTypeEnum: $pageTypeEnum}
+                    ) {
+                        job {
+                            description
+                            __typename
+                        }
+                        __typename
+                    }
+                }
+                """
+            }
+        ]
+        response = requests.post(url, json=body, headers=GlassdoorScraper.headers())
+        if response.status_code != 200:
+            return None
+        data = response.json()[0]
+        desc = data["data"]["jobview"]["job"]["description"]
+        soup = BeautifulSoup(desc, "html.parser")
+        return soup.get_text(separator="\n")
+
     @staticmethod
     def parse_compensation(data: dict) -> Optional[Compensation]:
         pay_period = data.get("payPeriod")
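
The fan-out pattern this Glassdoor change introduces, reduced to a standalone sketch (fetch_one and the listing values are illustrative, not the scraper's API): each listing is submitted to a thread pool and results are collected as they complete, so one slow description fetch doesn't block the rest.

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def fetch_one(listing_id: str) -> dict:
        # stand-in for a per-job network call such as a description fetch
        return {"id": listing_id, "description": f"description for {listing_id}"}

    listings = ["111", "222", "333"]
    results = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(fetch_one, listing): listing for listing in listings}
        for future in as_completed(futures):
            try:
                results.append(future.result())
            except Exception as exc:
                # surface which listing failed instead of losing it silently
                print(f"listing {futures[future]} failed: {exc}")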

View File

@@ -78,7 +78,7 @@ class IndeedScraper(Scraper):
         if sc_values:
             params["sc"] = "0kf:" + "".join(sc_values) + ";"
         try:
-            session = create_session(self.proxy, is_tls=True)
+            session = create_session(self.proxy)
             response = session.get(
                 f"{self.url}/jobs",
                 headers=self.get_headers(),
@@ -140,7 +140,8 @@ class IndeedScraper(Scraper):
             date_posted = datetime.fromtimestamp(timestamp_seconds)
             date_posted = date_posted.strftime("%Y-%m-%d")

-            description = self.get_description(job_url)
+            description = self.get_description(job_url) if scraper_input.full_description else None
+
             with io.StringIO(job["snippet"]) as f:
                 soup_io = BeautifulSoup(f, "html.parser")
                 li_elements = soup_io.find_all("li")
@@ -246,7 +247,7 @@ class IndeedScraper(Scraper):
             return None
         soup = BeautifulSoup(job_description, "html.parser")
-        text_content = " ".join(soup.get_text(separator=" ").split()).strip()
+        text_content = "\n".join(soup.stripped_strings)
         return text_content
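
The description cleanup swap above changes output shape, not just whitespace. A small before/after sketch, assuming only BeautifulSoup:

    from bs4 import BeautifulSoup

    html = "<p>Senior role.</p><ul><li>Python</li><li>SQL</li></ul>"
    soup = BeautifulSoup(html, "html.parser")

    # old behavior: all text collapsed onto one line
    print(" ".join(soup.get_text(separator=" ").split()).strip())
    # Senior role. Python SQL

    # new behavior: one line per text node, list structure preserved
    print("\n".join(soup.stripped_strings))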

View File

@@ -111,7 +111,7 @@ class LinkedInScraper(Scraper):
                 # Call process_job directly without threading
                 try:
-                    job_post = self.process_job(job_card, job_url)
+                    job_post = self.process_job(job_card, job_url, scraper_input.full_description)
                     if job_post:
                         job_list.append(job_post)
                 except Exception as e:
@@ -123,7 +123,7 @@ class LinkedInScraper(Scraper):
         job_list = job_list[: scraper_input.results_wanted]
         return JobResponse(jobs=job_list)

-    def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]:
+    def process_job(self, job_card: Tag, job_url: str, full_descr: bool) -> Optional[JobPost]:
         salary_tag = job_card.find('span', class_='job-search-card__salary-info')

         compensation = None
@@ -160,7 +160,7 @@ class LinkedInScraper(Scraper):
             if metadata_card
             else None
         )
-        date_posted = None
+        date_posted = description = job_type = None
         if datetime_tag and "datetime" in datetime_tag.attrs:
             datetime_str = datetime_tag["datetime"]
             try:
@@ -169,9 +169,8 @@ class LinkedInScraper(Scraper):
                 date_posted = None
         benefits_tag = job_card.find("span", class_="result-benefits__text")
         benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None
-
-        # removed to speed up scraping
-        # description, job_type = self.get_job_description(job_url)
+        if full_descr:
+            description, job_type = self.get_job_description(job_url)

         return JobPost(
             title=title,
@@ -182,10 +181,10 @@ class LinkedInScraper(Scraper):
             job_url=job_url,
             compensation=compensation,
             benefits=benefits,
-            # job_type=job_type,
-            # description=description,
-            # emails=extract_emails_from_text(description) if description else None,
-            # num_urgent_words=count_urgent_words(description) if description else None,
+            job_type=job_type,
+            description=description,
+            emails=extract_emails_from_text(description) if description else None,
+            num_urgent_words=count_urgent_words(description) if description else None,
         )

     def get_job_description(
@@ -214,7 +213,7 @@ class LinkedInScraper(Scraper):
         description = None
         if div_content:
-            description = " ".join(div_content.get_text().split()).strip()
+            description = "\n".join(line.strip() for line in div_content.get_text(separator="\n").splitlines() if line.strip())

     def get_job_type(
         soup_job_type: BeautifulSoup,

View File

@@ -10,6 +10,7 @@ import re
 from datetime import datetime, date
 from typing import Optional, Tuple, Any

+import requests
 from bs4 import BeautifulSoup
 from concurrent.futures import ThreadPoolExecutor
@@ -26,6 +27,8 @@ class ZipRecruiterScraper(Scraper):
""" """
site = Site(Site.ZIP_RECRUITER) site = Site(Site.ZIP_RECRUITER)
self.url = "https://www.ziprecruiter.com" self.url = "https://www.ziprecruiter.com"
self.session = create_session(proxy)
self.get_cookies()
super().__init__(site, proxy=proxy) super().__init__(site, proxy=proxy)
self.jobs_per_page = 20 self.jobs_per_page = 20
@@ -44,12 +47,10 @@ class ZipRecruiterScraper(Scraper):
         if continue_token:
             params["continue"] = continue_token
         try:
-            session = create_session(self.proxy, is_tls=True)
-            response = session.get(
+            response = self.session.get(
                 f"https://api.ziprecruiter.com/jobs-app/jobs",
                 headers=self.headers(),
                 params=self.add_params(scraper_input),
-                timeout_seconds=10,
             )
             if response.status_code != 200:
                 raise ZipRecruiterException(
@@ -108,7 +109,7 @@ class ZipRecruiterScraper(Scraper):
         description = BeautifulSoup(
             job.get("job_description", "").strip(), "html.parser"
-        ).get_text()
+        ).get_text(separator="\n")
         company = job["hiring_company"].get("name") if "hiring_company" in job else None
         country_value = "usa" if job.get("job_country") == "US" else "canada"
@@ -156,6 +157,11 @@ class ZipRecruiterScraper(Scraper):
             num_urgent_words=count_urgent_words(description) if description else None,
         )

+    def get_cookies(self):
+        url = "https://api.ziprecruiter.com/jobs-app/event"
+        data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
+        self.session.post(url, data=data, headers=ZipRecruiterScraper.headers())
+
     @staticmethod
     def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
         for job_type in JobType:
@@ -195,12 +201,16 @@ class ZipRecruiterScraper(Scraper):
     @staticmethod
     def headers() -> dict:
         """
-        Returns headers needed for ZipRecruiter API requests
+        Returns headers needed for requests
         :return: dict - Dictionary containing headers
         """
         return {
-            'Host': 'api.ziprecruiter.com',
-            'accept': '*/*',
-            'authorization': 'Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==',
-            'Cookie': '__cf_bm=DZ7eJOw6lka.Bwy5jLeDqWanaZ8BJlVAwaXrmcbYnxM-1701505132-0-AfGaVIfTA2kJlmleK14o722vbVwpZ+4UxFznsWv+guvzXSpD9KVEy/+pNzvEZUx88yaEShJwGt3/EVjhHirX/ASustKxg47V/aXRd2XIO2QN; zglobalid=61f94830-1990-4130-b222-d9d0e09c7825.57da9ea9581c.656ae86b; ziprecruiter_browser=018188e0-045b-4ad7-aa50-627a6c3d43aa; ziprecruiter_session=5259b2219bf95b6d2299a1417424bc2edc9f4b38; zva=100000000%3Bvid%3AZWroa0x_F1KEeGeU'
+            "Host": "api.ziprecruiter.com",
+            "accept": "*/*",
+            "x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc",
+            "x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0",
+            "x-deviceid": "D77B3A92-E589-46A4-8A39-6EF6F1D86006",
+            "user-agent": "Job Search/87.0 (iPhone; CPU iOS 16_6_1 like Mac OS X)",
+            "authorization": "Basic YTBlZjMyZDYtN2I0Yy00MWVkLWEyODMtYTI1NDAzMzI0YTcyOg==",
+            "accept-language": "en-US,en;q=0.9",
         }
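
The cookie fix works because a single shared session keeps whatever cookies the jobs-app/event endpoint sets and replays them on later API calls. A minimal sketch of that session-priming idea using plain requests (the scraper itself builds its session through create_session with proxy support):

    import requests

    session = requests.Session()

    # priming request: Set-Cookie headers from the response land in session.cookies
    session.get("https://httpbin.org/cookies/set?primed=1")

    # later calls on the same session send those cookies automatically
    response = session.get("https://httpbin.org/cookies")
    print(response.json())  # {'cookies': {'primed': '1'}}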