Multiple job types for Indeed, urgent keywords column (#56)

* enh(indeed): mult job types

* feat(jobs):  urgent kws

* fix(indeed): use new session obj per request

* fix: emails as comma separated in output

* fix: put num urgent words in output

* chore: readme
pull/58/head
Cullen Watson 2023-10-10 11:23:04 -05:00 committed by GitHub
parent 628f4dee9c
commit e5353e604d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 271 additions and 247 deletions

View File

@ -33,37 +33,19 @@ _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/)
```python ```python
from jobspy import scrape_jobs from jobspy import scrape_jobs
import pandas as pd
jobs: pd.DataFrame = scrape_jobs( jobs = scrape_jobs(
site_name=["indeed", "linkedin", "zip_recruiter"], site_name=["indeed", "linkedin", "zip_recruiter"],
search_term="software engineer", search_term="software engineer",
location="Dallas, TX", location="Dallas, TX",
results_wanted=10, results_wanted=10,
country_indeed='USA' # only needed for indeed country_indeed='USA' # only needed for indeed
# use if you want to use a proxy
# proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
# offset=25 # use if you want to start at a specific offset
) )
print(f"Found {len(jobs)} jobs")
print(jobs.head())
jobs.to_csv("jobs.csv", index=False)
# formatting for pandas # output to .xlsx
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc
# 1 output to console
print(jobs)
# 2 display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook)
# display(jobs)
# 3 output to .csv
# jobs.to_csv('jobs.csv', index=False)
# 4 output to .xlsx
# jobs.to_xlsx('jobs.xlsx', index=False) # jobs.to_xlsx('jobs.xlsx', index=False)
``` ```
@ -117,6 +99,9 @@ JobPost
│ ├── max_amount (int) │ ├── max_amount (int)
│ └── currency (enum) │ └── currency (enum)
└── date_posted (date) └── date_posted (date)
└── emails (str)
└── num_urgent_words (int)
└── is_remote (bool) - just for Indeed at the momen
``` ```
### Exceptions ### Exceptions

View File

@ -6,23 +6,23 @@ jobs: pd.DataFrame = scrape_jobs(
search_term="software engineer", search_term="software engineer",
location="Dallas, TX", location="Dallas, TX",
results_wanted=50, # be wary the higher it is, the more likey you'll get blocked (rotating proxy should work tho) results_wanted=50, # be wary the higher it is, the more likey you'll get blocked (rotating proxy should work tho)
country_indeed='USA', country_indeed="USA",
offset=25 # start jobs from an offset (use if search failed and want to continue) offset=25 # start jobs from an offset (use if search failed and want to continue)
# proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001", # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
) )
# formatting for pandas # formatting for pandas
pd.set_option('display.max_columns', None) pd.set_option("display.max_columns", None)
pd.set_option('display.max_rows', None) pd.set_option("display.max_rows", None)
pd.set_option('display.width', None) pd.set_option("display.width", None)
pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc pd.set_option("display.max_colwidth", 50) # set to 0 to see full job url / desc
# 1: output to console # 1: output to console
print(jobs) print(jobs)
# 2: output to .csv # 2: output to .csv
jobs.to_csv('./jobs.csv', index=False) jobs.to_csv("./jobs.csv", index=False)
print('outputted to jobs.csv') print("outputted to jobs.csv")
# 3: output to .xlsx # 3: output to .xlsx
# jobs.to_xlsx('jobs.xlsx', index=False) # jobs.to_xlsx('jobs.xlsx', index=False)

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "python-jobspy" name = "python-jobspy"
version = "1.1.12" version = "1.1.13"
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter" description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"] authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
homepage = "https://github.com/cullenwatson/JobSpy" homepage = "https://github.com/cullenwatson/JobSpy"

View File

@ -1,7 +1,7 @@
import pandas as pd import pandas as pd
import concurrent.futures import concurrent.futures
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple, Optional from typing import Tuple, Optional
from .jobs import JobType, Location from .jobs import JobType, Location
from .scrapers.indeed import IndeedScraper from .scrapers.indeed import IndeedScraper
@ -26,7 +26,7 @@ def _map_str_to_site(site_name: str) -> Site:
def scrape_jobs( def scrape_jobs(
site_name: str | List[str] | Site | List[Site], site_name: str | list[str] | Site | list[Site],
search_term: str, search_term: str,
location: str = "", location: str = "",
distance: int = None, distance: int = None,
@ -37,7 +37,7 @@ def scrape_jobs(
country_indeed: str = "usa", country_indeed: str = "usa",
hyperlinks: bool = False, hyperlinks: bool = False,
proxy: Optional[str] = None, proxy: Optional[str] = None,
offset: Optional[int] = 0 offset: Optional[int] = 0,
) -> pd.DataFrame: ) -> pd.DataFrame:
""" """
Simultaneously scrapes job data from multiple job sites. Simultaneously scrapes job data from multiple job sites.
@ -72,7 +72,7 @@ def scrape_jobs(
job_type=job_type, job_type=job_type,
easy_apply=easy_apply, easy_apply=easy_apply,
results_wanted=results_wanted, results_wanted=results_wanted,
offset=offset offset=offset,
) )
def scrape_site(site: Site) -> Tuple[str, JobResponse]: def scrape_site(site: Site) -> Tuple[str, JobResponse]:
@ -98,8 +98,8 @@ def scrape_jobs(
site_to_jobs_dict = {} site_to_jobs_dict = {}
def worker(site): def worker(site):
site_value, scraped_data = scrape_site(site) site_val, scraped_info = scrape_site(site)
return site_value, scraped_data return site_val, scraped_info
with ThreadPoolExecutor() as executor: with ThreadPoolExecutor() as executor:
future_to_site = { future_to_site = {
@ -110,7 +110,7 @@ def scrape_jobs(
site_value, scraped_data = future.result() site_value, scraped_data = future.result()
site_to_jobs_dict[site_value] = scraped_data site_to_jobs_dict[site_value] = scraped_data
jobs_dfs: List[pd.DataFrame] = [] jobs_dfs: list[pd.DataFrame] = []
for site, job_response in site_to_jobs_dict.items(): for site, job_response in site_to_jobs_dict.items():
for job in job_response.jobs: for job in job_response.jobs:
@ -120,12 +120,14 @@ def scrape_jobs(
] = f'<a href="{job_data["job_url"]}">{job_data["job_url"]}</a>' ] = f'<a href="{job_data["job_url"]}">{job_data["job_url"]}</a>'
job_data["site"] = site job_data["site"] = site
job_data["company"] = job_data["company_name"] job_data["company"] = job_data["company_name"]
if job_data["job_type"]: job_data["job_type"] = (
# Take the first value from the job type tuple ", ".join(job_type.value[0] for job_type in job_data["job_type"])
job_data["job_type"] = job_data["job_type"].value[0] if job_data["job_type"]
else: else None
job_data["job_type"] = None )
job_data["emails"] = (
", ".join(job_data["emails"]) if job_data["emails"] else None
)
job_data["location"] = Location(**job_data["location"]).display_location() job_data["location"] = Location(**job_data["location"]).display_location()
compensation_obj = job_data.get("compensation") compensation_obj = job_data.get("compensation")
@ -149,7 +151,7 @@ def scrape_jobs(
if jobs_dfs: if jobs_dfs:
jobs_df = pd.concat(jobs_dfs, ignore_index=True) jobs_df = pd.concat(jobs_dfs, ignore_index=True)
desired_order: List[str] = [ desired_order: list[str] = [
"job_url_hyper" if hyperlinks else "job_url", "job_url_hyper" if hyperlinks else "job_url",
"site", "site",
"title", "title",
@ -158,12 +160,13 @@ def scrape_jobs(
"job_type", "job_type",
"date_posted", "date_posted",
"interval", "interval",
"benefits",
"min_amount", "min_amount",
"max_amount", "max_amount",
"currency", "currency",
"is_remote",
"num_urgent_words",
"benefits",
"emails", "emails",
"job_url_hyper" if hyperlinks else "job_url",
"description", "description",
] ]
jobs_formatted_df = jobs_df[desired_order] jobs_formatted_df = jobs_df[desired_order]

View File

@ -182,12 +182,15 @@ class JobPost(BaseModel):
job_url: str job_url: str
location: Optional[Location] location: Optional[Location]
description: Optional[str] = None description: str | None = None
job_type: Optional[JobType] = None job_type: list[JobType] | None = None
compensation: Optional[Compensation] = None compensation: Compensation | None = None
date_posted: Optional[date] = None date_posted: date | None = None
benefits: Optional[str] = None benefits: str | None = None
emails: Optional[list[str]] = None emails: list[str] | None = None
num_urgent_words: int | None = None
is_remote: bool | None = None
# company_industry: str | None = None
class JobResponse(BaseModel): class JobResponse(BaseModel):

View File

@ -9,15 +9,14 @@ import math
import io import io
import json import json
from datetime import datetime from datetime import datetime
from typing import Optional
import tls_client
import urllib.parse import urllib.parse
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import Tag from bs4.element import Tag
from concurrent.futures import ThreadPoolExecutor, Future from concurrent.futures import ThreadPoolExecutor, Future
from ..exceptions import IndeedException from ..exceptions import IndeedException
from ..utils import count_urgent_words, extract_emails_from_text, create_session
from ...jobs import ( from ...jobs import (
JobPost, JobPost,
Compensation, Compensation,
@ -28,15 +27,9 @@ from ...jobs import (
) )
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
def extract_emails_from_text(text: str) -> Optional[list[str]]:
if not text:
return None
email_regex = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
return email_regex.findall(text)
class IndeedScraper(Scraper): class IndeedScraper(Scraper):
def __init__(self, proxy: Optional[str] = None): def __init__(self, proxy: str | None = None):
""" """
Initializes IndeedScraper with the Indeed job search url Initializes IndeedScraper with the Indeed job search url
""" """
@ -49,20 +42,18 @@ class IndeedScraper(Scraper):
self.seen_urls = set() self.seen_urls = set()
def scrape_page( def scrape_page(
self, scraper_input: ScraperInput, page: int, session: tls_client.Session self, scraper_input: ScraperInput, page: int
) -> tuple[list[JobPost], int]: ) -> tuple[list[JobPost], int]:
""" """
Scrapes a page of Indeed for jobs with scraper_input criteria Scrapes a page of Indeed for jobs with scraper_input criteria
:param scraper_input: :param scraper_input:
:param page: :param page:
:param session:
:return: jobs found on page, total number of jobs found for search :return: jobs found on page, total number of jobs found for search
""" """
self.country = scraper_input.country self.country = scraper_input.country
domain = self.country.domain_value domain = self.country.domain_value
self.url = f"https://{domain}.indeed.com" self.url = f"https://{domain}.indeed.com"
session = create_session(self.proxy)
job_list: list[JobPost] = []
params = { params = {
"q": scraper_input.search_term, "q": scraper_input.search_term,
@ -84,9 +75,9 @@ class IndeedScraper(Scraper):
try: try:
response = session.get( response = session.get(
f"{self.url}/jobs", f"{self.url}/jobs",
headers=self.get_headers(),
params=params, params=params,
allow_redirects=True, allow_redirects=True,
proxy=self.proxy,
timeout_seconds=10, timeout_seconds=10,
) )
if response.status_code not in range(200, 400): if response.status_code not in range(200, 400):
@ -114,7 +105,7 @@ class IndeedScraper(Scraper):
): ):
raise IndeedException("No jobs found.") raise IndeedException("No jobs found.")
def process_job(job) -> Optional[JobPost]: def process_job(job) -> JobPost | None:
job_url = f'{self.url}/jobs/viewjob?jk={job["jobkey"]}' job_url = f'{self.url}/jobs/viewjob?jk={job["jobkey"]}'
job_url_client = f'{self.url}/viewjob?jk={job["jobkey"]}' job_url_client = f'{self.url}/viewjob?jk={job["jobkey"]}'
if job_url in self.seen_urls: if job_url in self.seen_urls:
@ -143,8 +134,7 @@ class IndeedScraper(Scraper):
date_posted = datetime.fromtimestamp(timestamp_seconds) date_posted = datetime.fromtimestamp(timestamp_seconds)
date_posted = date_posted.strftime("%Y-%m-%d") date_posted = date_posted.strftime("%Y-%m-%d")
description = self.get_description(job_url, session) description = self.get_description(job_url)
emails = extract_emails_from_text(description)
with io.StringIO(job["snippet"]) as f: with io.StringIO(job["snippet"]) as f:
soup_io = BeautifulSoup(f, "html.parser") soup_io = BeautifulSoup(f, "html.parser")
li_elements = soup_io.find_all("li") li_elements = soup_io.find_all("li")
@ -160,11 +150,15 @@ class IndeedScraper(Scraper):
state=job.get("jobLocationState"), state=job.get("jobLocationState"),
country=self.country, country=self.country,
), ),
emails=extract_emails_from_text(description),
job_type=job_type, job_type=job_type,
compensation=compensation, compensation=compensation,
date_posted=date_posted, date_posted=date_posted,
job_url=job_url_client, job_url=job_url_client,
emails=extract_emails_from_text(description) if description else None,
num_urgent_words=count_urgent_words(description)
if description
else None,
is_remote=self.is_remote_job(job),
) )
return job_post return job_post
@ -184,20 +178,16 @@ class IndeedScraper(Scraper):
:param scraper_input: :param scraper_input:
:return: job_response :return: job_response
""" """
session = tls_client.Session(
client_identifier="chrome112", random_tls_extension_order=True
)
pages_to_process = ( pages_to_process = (
math.ceil(scraper_input.results_wanted / self.jobs_per_page) - 1 math.ceil(scraper_input.results_wanted / self.jobs_per_page) - 1
) )
#: get first page to initialize session #: get first page to initialize session
job_list, total_results = self.scrape_page(scraper_input, 0, session) job_list, total_results = self.scrape_page(scraper_input, 0)
with ThreadPoolExecutor(max_workers=1) as executor: with ThreadPoolExecutor(max_workers=1) as executor:
futures: list[Future] = [ futures: list[Future] = [
executor.submit(self.scrape_page, scraper_input, page, session) executor.submit(self.scrape_page, scraper_input, page)
for page in range(1, pages_to_process + 1) for page in range(1, pages_to_process + 1)
] ]
@ -215,21 +205,24 @@ class IndeedScraper(Scraper):
) )
return job_response return job_response
def get_description(self, job_page_url: str, session: tls_client.Session) -> Optional[str]: def get_description(self, job_page_url: str) -> str | None:
""" """
Retrieves job description by going to the job page url Retrieves job description by going to the job page url
:param job_page_url: :param job_page_url:
:param session:
:return: description :return: description
""" """
parsed_url = urllib.parse.urlparse(job_page_url) parsed_url = urllib.parse.urlparse(job_page_url)
params = urllib.parse.parse_qs(parsed_url.query) params = urllib.parse.parse_qs(parsed_url.query)
jk_value = params.get("jk", [None])[0] jk_value = params.get("jk", [None])[0]
formatted_url = f"{self.url}/viewjob?jk={jk_value}&spa=1" formatted_url = f"{self.url}/viewjob?jk={jk_value}&spa=1"
session = create_session(self.proxy)
try: try:
response = session.get( response = session.get(
formatted_url, allow_redirects=True, timeout_seconds=5, proxy=self.proxy formatted_url,
headers=self.get_headers(),
allow_redirects=True,
timeout_seconds=5,
) )
except Exception as e: except Exception as e:
return None return None
@ -246,20 +239,23 @@ class IndeedScraper(Scraper):
return text_content return text_content
@staticmethod @staticmethod
def get_job_type(job: dict) -> Optional[JobType]: def get_job_type(job: dict) -> list[JobType] | None:
""" """
Parses the job to get JobTypeIndeed Parses the job to get list of job types
:param job: :param job:
:return: :return:
""" """
job_types: list[JobType] = []
for taxonomy in job["taxonomyAttributes"]: for taxonomy in job["taxonomyAttributes"]:
if taxonomy["label"] == "job-types": if taxonomy["label"] == "job-types":
if len(taxonomy["attributes"]) > 0: for i in range(len(taxonomy["attributes"])):
label = taxonomy["attributes"][0].get("label") label = taxonomy["attributes"][i].get("label")
if label: if label:
job_type_str = label.replace("-", "").replace(" ", "").lower() job_type_str = label.replace("-", "").replace(" ", "").lower()
return IndeedScraper.get_enum_from_job_type(job_type_str) job_types.append(
return None IndeedScraper.get_enum_from_job_type(job_type_str)
)
return job_types
@staticmethod @staticmethod
def get_enum_from_job_type(job_type_str): def get_enum_from_job_type(job_type_str):
@ -280,7 +276,7 @@ class IndeedScraper(Scraper):
:return: jobs :return: jobs
""" """
def find_mosaic_script() -> Optional[Tag]: def find_mosaic_script() -> Tag | None:
""" """
Finds jobcards script tag Finds jobcards script tag
:return: script_tag :return: script_tag
@ -330,3 +326,30 @@ class IndeedScraper(Scraper):
data = json.loads(json_str) data = json.loads(json_str)
total_num_jobs = int(data["searchTitleBarModel"]["totalNumResults"]) total_num_jobs = int(data["searchTitleBarModel"]["totalNumResults"])
return total_num_jobs return total_num_jobs
@staticmethod
def get_headers():
return {
"authority": "www.indeed.com",
"accept": "*/*",
"accept-language": "en-US,en;q=0.9",
"referer": "https://www.indeed.com/viewjob?jk=fe6182337d72c7b1&tk=1hcbfcmd0k62t802&from=serp&vjs=3&advn=8132938064490989&adid=408692607&ad=-6NYlbfkN0A3Osc99MJFDKjquSk4WOGT28ALb_ad4QMtrHreCb9ICg6MiSVy9oDAp3evvOrI7Q-O9qOtQTg1EPbthP9xWtBN2cOuVeHQijxHjHpJC65TjDtftH3AXeINjBvAyDrE8DrRaAXl8LD3Fs1e_xuDHQIssdZ2Mlzcav8m5jHrA0fA64ZaqJV77myldaNlM7-qyQpy4AsJQfvg9iR2MY7qeC5_FnjIgjKIy_lNi9OPMOjGRWXA94CuvC7zC6WeiJmBQCHISl8IOBxf7EdJZlYdtzgae3593TFxbkd6LUwbijAfjax39aAuuCXy3s9C4YgcEP3TwEFGQoTpYu9Pmle-Ae1tHGPgsjxwXkgMm7Cz5mBBdJioglRCj9pssn-1u1blHZM4uL1nK9p1Y6HoFgPUU9xvKQTHjKGdH8d4y4ETyCMoNF4hAIyUaysCKdJKitC8PXoYaWhDqFtSMR4Jys8UPqUV&xkcb=SoDD-_M3JLQfWnQTDh0LbzkdCdPP&xpse=SoBa6_I3JLW9FlWZlB0PbzkdCdPP&sjdu=i6xVERweJM_pVUvgf-MzuaunBTY7G71J5eEX6t4DrDs5EMPQdODrX7Nn-WIPMezoqr5wA_l7Of-3CtoiUawcHw",
"sec-ch-ua": '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Windows"',
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
}
@staticmethod
def is_remote_job(job: dict) -> bool:
"""
:param job:
:return: bool
"""
for taxonomy in job.get("taxonomyAttributes", []):
if taxonomy["label"] == "remote" and len(taxonomy["attributes"]) > 0:
return True
return False

View File

@ -17,6 +17,7 @@ from bs4.element import Tag
from threading import Lock from threading import Lock
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
from ..utils import count_urgent_words, extract_emails_from_text
from ..exceptions import LinkedInException from ..exceptions import LinkedInException
from ...jobs import ( from ...jobs import (
JobPost, JobPost,
@ -26,13 +27,6 @@ from ...jobs import (
) )
def extract_emails_from_text(text: str) -> Optional[list[str]]:
if not text:
return None
email_regex = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
return email_regex.findall(text)
class LinkedInScraper(Scraper): class LinkedInScraper(Scraper):
MAX_RETRIES = 3 MAX_RETRIES = 3
DELAY = 10 DELAY = 10
@ -99,13 +93,15 @@ class LinkedInScraper(Scraper):
break break
except requests.HTTPError as e: except requests.HTTPError as e:
if hasattr(e, 'response') and e.response is not None: if hasattr(e, "response") and e.response is not None:
if e.response.status_code == 429: if e.response.status_code == 429:
time.sleep(self.DELAY) time.sleep(self.DELAY)
retries += 1 retries += 1
continue continue
else: else:
raise LinkedInException(f"bad response status code: {e.response.status_code}") raise LinkedInException(
f"bad response status code: {e.response.status_code}"
)
else: else:
raise raise
except ProxyError as e: except ProxyError as e:
@ -114,7 +110,9 @@ class LinkedInScraper(Scraper):
raise LinkedInException(str(e)) raise LinkedInException(str(e))
else: else:
# Raise an exception if the maximum number of retries is reached # Raise an exception if the maximum number of retries is reached
raise LinkedInException("Max retries reached, failed to get a valid response") raise LinkedInException(
"Max retries reached, failed to get a valid response"
)
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
@ -141,7 +139,9 @@ class LinkedInScraper(Scraper):
if job_post: if job_post:
job_list.append(job_post) job_list.append(job_post)
except Exception as e: except Exception as e:
raise LinkedInException("Exception occurred while processing jobs") raise LinkedInException(
"Exception occurred while processing jobs"
)
page += 25 page += 25
job_list = job_list[: scraper_input.results_wanted] job_list = job_list[: scraper_input.results_wanted]
@ -158,7 +158,11 @@ class LinkedInScraper(Scraper):
metadata_card = job_card.find("div", class_="base-search-card__metadata") metadata_card = job_card.find("div", class_="base-search-card__metadata")
location = self.get_location(metadata_card) location = self.get_location(metadata_card)
datetime_tag = metadata_card.find("time", class_="job-search-card__listdate") if metadata_card else None datetime_tag = (
metadata_card.find("time", class_="job-search-card__listdate")
if metadata_card
else None
)
date_posted = None date_posted = None
if datetime_tag and "datetime" in datetime_tag.attrs: if datetime_tag and "datetime" in datetime_tag.attrs:
datetime_str = datetime_tag["datetime"] datetime_str = datetime_tag["datetime"]
@ -178,13 +182,16 @@ class LinkedInScraper(Scraper):
location=location, location=location,
date_posted=date_posted, date_posted=date_posted,
job_url=job_url, job_url=job_url,
# job_type=[JobType.FULL_TIME],
job_type=job_type, job_type=job_type,
benefits=benefits, benefits=benefits,
emails=extract_emails_from_text(description) emails=extract_emails_from_text(description) if description else None,
num_urgent_words=count_urgent_words(description) if description else None,
) )
def get_job_description(self, job_page_url: str) -> tuple[None, None] | tuple[ def get_job_description(
str | None, tuple[str | None, JobType | None]]: self, job_page_url: str
) -> tuple[None, None] | tuple[str | None, tuple[str | None, JobType | None]]:
""" """
Retrieves job description by going to the job page url Retrieves job description by going to the job page url
:param job_page_url: :param job_page_url:
@ -207,7 +214,7 @@ class LinkedInScraper(Scraper):
def get_job_type( def get_job_type(
soup_job_type: BeautifulSoup, soup_job_type: BeautifulSoup,
) -> JobType | None: ) -> list[JobType] | None:
""" """
Gets the job type from job page Gets the job type from job page
:param soup_job_type: :param soup_job_type:
@ -238,7 +245,7 @@ class LinkedInScraper(Scraper):
def get_enum_from_value(value_str): def get_enum_from_value(value_str):
for job_type in JobType: for job_type in JobType:
if value_str in job_type.value: if value_str in job_type.value:
return job_type return [job_type]
return None return None
def get_location(self, metadata_card: Optional[Tag]) -> Location: def get_location(self, metadata_card: Optional[Tag]) -> Location:
@ -263,9 +270,3 @@ class LinkedInScraper(Scraper):
) )
return location return location
def extract_emails_from_text(text: str) -> Optional[list[str]]:
if not text:
return None
email_regex = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
return email_regex.findall(text)

View File

@ -11,7 +11,6 @@ from datetime import datetime, date
from typing import Optional, Tuple, Any from typing import Optional, Tuple, Any
from urllib.parse import urlparse, parse_qs, urlunparse from urllib.parse import urlparse, parse_qs, urlunparse
import tls_client
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import Tag from bs4.element import Tag
@ -19,6 +18,7 @@ from concurrent.futures import ThreadPoolExecutor, Future
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
from ..exceptions import ZipRecruiterException from ..exceptions import ZipRecruiterException
from ..utils import count_urgent_words, extract_emails_from_text, create_session
from ...jobs import ( from ...jobs import (
JobPost, JobPost,
Compensation, Compensation,
@ -29,12 +29,6 @@ from ...jobs import (
Country, Country,
) )
def extract_emails_from_text(text: str) -> Optional[list[str]]:
if not text:
return None
email_regex = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
return email_regex.findall(text)
class ZipRecruiterScraper(Scraper): class ZipRecruiterScraper(Scraper):
def __init__(self, proxy: Optional[str] = None): def __init__(self, proxy: Optional[str] = None):
@ -47,9 +41,6 @@ class ZipRecruiterScraper(Scraper):
self.jobs_per_page = 20 self.jobs_per_page = 20
self.seen_urls = set() self.seen_urls = set()
self.session = tls_client.Session(
client_identifier="chrome112", random_tls_extension_order=True
)
def find_jobs_in_page( def find_jobs_in_page(
self, scraper_input: ScraperInput, page: int self, scraper_input: ScraperInput, page: int
@ -60,14 +51,13 @@ class ZipRecruiterScraper(Scraper):
:param page: :param page:
:return: jobs found on page :return: jobs found on page
""" """
job_list: list[JobPost] = [] session = create_session(self.proxy)
try: try:
response = self.session.get( response = session.get(
f"{self.url}/jobs-search", f"{self.url}/jobs-search",
headers=ZipRecruiterScraper.headers(), headers=self.headers(),
params=ZipRecruiterScraper.add_params(scraper_input, page), params=self.add_params(scraper_input, page),
allow_redirects=True, allow_redirects=True,
proxy=self.proxy,
timeout_seconds=10, timeout_seconds=10,
) )
if response.status_code != 200: if response.status_code != 200:
@ -121,7 +111,11 @@ class ZipRecruiterScraper(Scraper):
:param scraper_input: :param scraper_input:
:return: job_response :return: job_response
""" """
start_page = (scraper_input.offset // self.jobs_per_page) + 1 if scraper_input.offset else 1 start_page = (
(scraper_input.offset // self.jobs_per_page) + 1
if scraper_input.offset
else 1
)
#: get first page to initialize session #: get first page to initialize session
job_list: list[JobPost] = self.find_jobs_in_page(scraper_input, start_page) job_list: list[JobPost] = self.find_jobs_in_page(scraper_input, start_page)
pages_to_process = max( pages_to_process = max(
@ -142,91 +136,10 @@ class ZipRecruiterScraper(Scraper):
job_list = job_list[: scraper_input.results_wanted] job_list = job_list[: scraper_input.results_wanted]
return JobResponse(jobs=job_list) return JobResponse(jobs=job_list)
def process_job_html_1(self, job: Tag) -> Optional[JobPost]:
"""
Parses a job from the job content tag
:param job: BeautifulSoup Tag for one job post
:return JobPost
TODO this method isnt finished due to not encountering this type of html often
"""
job_url = self.cleanurl(job.find("a", {"class": "job_link"})["href"])
if job_url in self.seen_urls:
return None
title = job.find("h2", {"class": "title"}).text
company = job.find("a", {"class": "company_name"}).text.strip()
description, updated_job_url = self.get_description(job_url)
# job_url = updated_job_url if updated_job_url else job_url
if description is None:
description = job.find("p", {"class": "job_snippet"}).text.strip()
job_type_element = job.find("li", {"class": "perk_item perk_type"})
job_type = None
if job_type_element:
job_type_text = (
job_type_element.text.strip().lower().replace("_", "").replace(" ", "")
)
job_type = ZipRecruiterScraper.get_job_type_enum(job_type_text)
date_posted = ZipRecruiterScraper.get_date_posted(job)
job_post = JobPost(
title=title,
description=description,
company_name=company,
location=ZipRecruiterScraper.get_location(job),
job_type=job_type,
compensation=ZipRecruiterScraper.get_compensation(job),
date_posted=date_posted,
job_url=job_url,
emails=extract_emails_from_text(description),
)
return job_post
def process_job_html_2(self, job: Tag) -> Optional[JobPost]:
"""
Parses a job from the job content tag for a second variat of HTML that ZR uses
:param job: BeautifulSoup Tag for one job post
:return JobPost
"""
job_url = self.cleanurl(job.find("a", class_="job_link")["href"])
title = job.find("h2", class_="title").text
company = job.find("a", class_="company_name").text.strip()
description, updated_job_url = self.get_description(job_url)
# job_url = updated_job_url if updated_job_url else job_url
if description is None:
description = job.find("p", class_="job_snippet").get_text().strip()
job_type_text = job.find("li", class_="perk_item perk_type")
job_type = None
if job_type_text:
job_type_text = (
job_type_text.get_text()
.strip()
.lower()
.replace("-", "")
.replace(" ", "")
)
job_type = ZipRecruiterScraper.get_job_type_enum(job_type_text)
date_posted = ZipRecruiterScraper.get_date_posted(job)
job_post = JobPost(
title=title,
description=description,
company_name=company,
location=ZipRecruiterScraper.get_location(job),
job_type=job_type,
compensation=ZipRecruiterScraper.get_compensation(job),
date_posted=date_posted,
job_url=job_url,
)
return job_post
def process_job_javascript(self, job: dict) -> JobPost: def process_job_javascript(self, job: dict) -> JobPost:
"""the most common type of jobs page on ZR"""
title = job.get("Title") title = job.get("Title")
job_url = self.cleanurl(job.get("JobURL")) job_url = job.get("JobURL")
description, updated_job_url = self.get_description(job_url) description, updated_job_url = self.get_description(job_url)
# job_url = updated_job_url if updated_job_url else job_url # job_url = updated_job_url if updated_job_url else job_url
@ -280,38 +193,126 @@ class ZipRecruiterScraper(Scraper):
return JobPost( return JobPost(
title=title, title=title,
description=description,
company_name=company, company_name=company,
location=location, location=location,
job_type=job_type, job_type=job_type,
compensation=compensation, compensation=compensation,
date_posted=date_posted, date_posted=date_posted,
job_url=job_url, job_url=job_url,
description=description,
emails=extract_emails_from_text(description) if description else None,
num_urgent_words=count_urgent_words(description) if description else None,
)
def process_job_html_2(self, job: Tag) -> Optional[JobPost]:
"""
second most common type of jobs page on ZR after process_job_javascript()
Parses a job from the job content tag for a second variat of HTML that ZR uses
:param job: BeautifulSoup Tag for one job post
:return JobPost
"""
job_url = job.find("a", class_="job_link")["href"]
title = job.find("h2", class_="title").text
company = job.find("a", class_="company_name").text.strip()
description, updated_job_url = self.get_description(job_url)
# job_url = updated_job_url if updated_job_url else job_url
if description is None:
description = job.find("p", class_="job_snippet").get_text().strip()
job_type_text = job.find("li", class_="perk_item perk_type")
job_type = None
if job_type_text:
job_type_text = (
job_type_text.get_text()
.strip()
.lower()
.replace("-", "")
.replace(" ", "")
)
job_type = ZipRecruiterScraper.get_job_type_enum(job_type_text)
date_posted = ZipRecruiterScraper.get_date_posted(job)
job_post = JobPost(
title=title,
company_name=company,
location=ZipRecruiterScraper.get_location(job),
job_type=job_type,
compensation=ZipRecruiterScraper.get_compensation(job),
date_posted=date_posted,
job_url=job_url,
description=description,
emails=extract_emails_from_text(description) if description else None,
num_urgent_words=count_urgent_words(description) if description else None,
)
return job_post
def process_job_html_1(self, job: Tag) -> Optional[JobPost]:
"""
TODO this method isnt finished due to not encountering this type of html often
least common type of jobs page on ZR (rarely found)
Parses a job from the job content tag
:param job: BeautifulSoup Tag for one job post
:return JobPost
"""
job_url = job.find("a", {"class": "job_link"})["href"]
# job_url = self.cleanurl(job.find("a", {"class": "job_link"})["href"])
if job_url in self.seen_urls:
return None
title = job.find("h2", {"class": "title"}).text
company = job.find("a", {"class": "company_name"}).text.strip()
description, _ = self.get_description(job_url)
# job_url = updated_job_url if updated_job_url else job_url
# get description from jobs listing page if get_description from the specific job page fails
if description is None:
description = job.find("p", {"class": "job_snippet"}).text.strip()
job_type_element = job.find("li", {"class": "perk_item perk_type"})
job_type = None
if job_type_element:
job_type_text = (
job_type_element.text.strip().lower().replace("_", "").replace(" ", "")
)
job_type = ZipRecruiterScraper.get_job_type_enum(job_type_text)
date_posted = ZipRecruiterScraper.get_date_posted(job)
job_post = JobPost(
title=title,
description=description,
company_name=company,
location=ZipRecruiterScraper.get_location(job),
job_type=job_type,
compensation=ZipRecruiterScraper.get_compensation(job),
date_posted=date_posted,
job_url=job_url,
emails=extract_emails_from_text(description),
num_urgent_words=count_urgent_words(description),
) )
return job_post return job_post
@staticmethod @staticmethod
def get_job_type_enum(job_type_str: str) -> Optional[JobType]: def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
for job_type in JobType: for job_type in JobType:
if job_type_str in job_type.value: if job_type_str in job_type.value:
a = True return [job_type]
return job_type
return None return None
def get_description(self, job_page_url: str) -> Tuple[Optional[str], Optional[str]]: def get_description(self, job_page_url: str) -> Tuple[str | None, str | None]:
""" """
Retrieves job description by going to the job page url Retrieves job description by going to the job page url
:param job_page_url: :param job_page_url:
:param session:
:return: description or None, response url :return: description or None, response url
""" """
try: try:
response = requests.get( session = create_session(self.proxy)
response = session.get(
job_page_url, job_page_url,
headers=ZipRecruiterScraper.headers(), headers=self.headers(),
allow_redirects=True, allow_redirects=True,
timeout=5, timeout_seconds=5,
proxies=self.proxy,
) )
if response.status_code not in range(200, 400): if response.status_code not in range(200, 400):
return None, None return None, None
@ -467,8 +468,8 @@ class ZipRecruiterScraper(Scraper):
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36" "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
} }
@staticmethod # @staticmethod
def cleanurl(url): # def cleanurl(url) -> str:
parsed_url = urlparse(url) # parsed_url = urlparse(url)
#
return urlunparse((parsed_url.scheme, parsed_url.netloc, parsed_url.path, parsed_url.params, '', '')) # return urlunparse((parsed_url.scheme, parsed_url.netloc, parsed_url.path, parsed_url.params, '', ''))

View File

@ -9,4 +9,6 @@ def test_all():
results_wanted=5, results_wanted=5,
) )
assert isinstance(result, pd.DataFrame) and not result.empty, "Result should be a non-empty DataFrame" assert (
isinstance(result, pd.DataFrame) and not result.empty
), "Result should be a non-empty DataFrame"

View File

@ -7,4 +7,6 @@ def test_indeed():
site_name="indeed", site_name="indeed",
search_term="software engineer", search_term="software engineer",
) )
assert isinstance(result, pd.DataFrame) and not result.empty, "Result should be a non-empty DataFrame" assert (
isinstance(result, pd.DataFrame) and not result.empty
), "Result should be a non-empty DataFrame"

View File

@ -7,4 +7,6 @@ def test_linkedin():
site_name="linkedin", site_name="linkedin",
search_term="software engineer", search_term="software engineer",
) )
assert isinstance(result, pd.DataFrame) and not result.empty, "Result should be a non-empty DataFrame" assert (
isinstance(result, pd.DataFrame) and not result.empty
), "Result should be a non-empty DataFrame"

View File

@ -8,4 +8,6 @@ def test_ziprecruiter():
search_term="software engineer", search_term="software engineer",
) )
assert isinstance(result, pd.DataFrame) and not result.empty, "Result should be a non-empty DataFrame" assert (
isinstance(result, pd.DataFrame) and not result.empty
), "Result should be a non-empty DataFrame"