chore: readme

pull/56/head
Cullen Watson 2023-10-10 11:20:24 -05:00
parent 4aa832d3e2
commit 35bdff65a9
12 changed files with 225 additions and 214 deletions

View File

@ -33,37 +33,19 @@ _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/)
```python ```python
from jobspy import scrape_jobs from jobspy import scrape_jobs
import pandas as pd
jobs: pd.DataFrame = scrape_jobs( jobs = scrape_jobs(
site_name=["indeed", "linkedin", "zip_recruiter"], site_name=["indeed", "linkedin", "zip_recruiter"],
search_term="software engineer", search_term="software engineer",
location="Dallas, TX", location="Dallas, TX",
results_wanted=10, results_wanted=10,
country_indeed='USA' # only needed for indeed country_indeed='USA' # only needed for indeed
# use if you want to use a proxy
# proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
# offset=25 # use if you want to start at a specific offset
) )
print(f"Found {len(jobs)} jobs")
print(jobs.head())
jobs.to_csv("jobs.csv", index=False)
# formatting for pandas # output to .xlsx
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc
# 1 output to console
print(jobs)
# 2 display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook)
# display(jobs)
# 3 output to .csv
# jobs.to_csv('jobs.csv', index=False)
# 4 output to .xlsx
# jobs.to_xlsx('jobs.xlsx', index=False) # jobs.to_xlsx('jobs.xlsx', index=False)
``` ```
@ -117,6 +99,9 @@ JobPost
│ ├── max_amount (int) │ ├── max_amount (int)
│ └── currency (enum) │ └── currency (enum)
└── date_posted (date) └── date_posted (date)
└── emails (str)
└── num_urgent_words (int)
└── is_remote (bool) - just for Indeed at the moment
``` ```
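For context, here is a minimal sketch of working with the new fields from the returned DataFrame; the column names are assumed to mirror the JobPost attributes above (`emails`, `num_urgent_words`, `is_remote`), with `is_remote` only populated for Indeed as noted in the schema.

```python
from jobspy import scrape_jobs

jobs = scrape_jobs(
    site_name=["indeed"],
    search_term="software engineer",
    location="Dallas, TX",
    results_wanted=10,
    country_indeed="USA",
)

# keep only remote postings that list a contact email
# (column names assumed to match the JobPost fields shown above)
mask = jobs["is_remote"].fillna(False).astype(bool) & jobs["emails"].notna()
print(jobs[mask][["title", "company", "emails", "num_urgent_words"]])
```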
### Exceptions ### Exceptions

View File

@ -6,23 +6,23 @@ jobs: pd.DataFrame = scrape_jobs(
search_term="software engineer", search_term="software engineer",
location="Dallas, TX", location="Dallas, TX",
results_wanted=50, # be wary: the higher it is, the more likely you'll get blocked (rotating proxy should work though) results_wanted=50, # be wary: the higher it is, the more likely you'll get blocked (rotating proxy should work though)
country_indeed='USA', country_indeed="USA",
offset=25 # start jobs from an offset (use if search failed and want to continue) offset=25 # start jobs from an offset (use if search failed and want to continue)
# proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001", # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
) )
# formatting for pandas # formatting for pandas
pd.set_option('display.max_columns', None) pd.set_option("display.max_columns", None)
pd.set_option('display.max_rows', None) pd.set_option("display.max_rows", None)
pd.set_option('display.width', None) pd.set_option("display.width", None)
pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc pd.set_option("display.max_colwidth", 50) # set to 0 to see full job url / desc
# 1: output to console # 1: output to console
print(jobs) print(jobs)
# 2: output to .csv # 2: output to .csv
jobs.to_csv('./jobs.csv', index=False) jobs.to_csv("./jobs.csv", index=False)
print('outputted to jobs.csv') print("outputted to jobs.csv")
# 3: output to .xlsx # 3: output to .xlsx
# jobs.to_xlsx('jobs.xlsx', index=False) # jobs.to_xlsx('jobs.xlsx', index=False)

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "python-jobspy" name = "python-jobspy"
version = "1.1.12" version = "1.1.13"
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter" description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"] authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
homepage = "https://github.com/cullenwatson/JobSpy" homepage = "https://github.com/cullenwatson/JobSpy"

View File

@ -37,7 +37,7 @@ def scrape_jobs(
country_indeed: str = "usa", country_indeed: str = "usa",
hyperlinks: bool = False, hyperlinks: bool = False,
proxy: Optional[str] = None, proxy: Optional[str] = None,
offset: Optional[int] = 0 offset: Optional[int] = 0,
) -> pd.DataFrame: ) -> pd.DataFrame:
""" """
Simultaneously scrapes job data from multiple job sites. Simultaneously scrapes job data from multiple job sites.
@ -72,7 +72,7 @@ def scrape_jobs(
job_type=job_type, job_type=job_type,
easy_apply=easy_apply, easy_apply=easy_apply,
results_wanted=results_wanted, results_wanted=results_wanted,
offset=offset offset=offset,
) )
def scrape_site(site: Site) -> Tuple[str, JobResponse]: def scrape_site(site: Site) -> Tuple[str, JobResponse]:
@ -120,9 +120,14 @@ def scrape_jobs(
] = f'<a href="{job_data["job_url"]}">{job_data["job_url"]}</a>' ] = f'<a href="{job_data["job_url"]}">{job_data["job_url"]}</a>'
job_data["site"] = site job_data["site"] = site
job_data["company"] = job_data["company_name"] job_data["company"] = job_data["company_name"]
job_data["job_type"] = ", ".join(job_type.value[0] for job_type in job_data["job_type"]) if job_data[ job_data["job_type"] = (
"job_type"] else None ", ".join(job_type.value[0] for job_type in job_data["job_type"])
job_data["emails"] = ", ".join(job_data["emails"]) if job_data["emails"] else None if job_data["job_type"]
else None
)
job_data["emails"] = (
", ".join(job_data["emails"]) if job_data["emails"] else None
)
job_data["location"] = Location(**job_data["location"]).display_location() job_data["location"] = Location(**job_data["location"]).display_location()
compensation_obj = job_data.get("compensation") compensation_obj = job_data.get("compensation")

View File

@ -189,7 +189,7 @@ class JobPost(BaseModel):
benefits: str | None = None benefits: str | None = None
emails: list[str] | None = None emails: list[str] | None = None
num_urgent_words: int | None = None num_urgent_words: int | None = None
# is_remote: bool | None = None is_remote: bool | None = None
# company_industry: str | None = None # company_industry: str | None = None

View File

@ -10,14 +10,13 @@ import io
import json import json
from datetime import datetime from datetime import datetime
import tls_client
import urllib.parse import urllib.parse
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import Tag from bs4.element import Tag
from concurrent.futures import ThreadPoolExecutor, Future from concurrent.futures import ThreadPoolExecutor, Future
from ..exceptions import IndeedException from ..exceptions import IndeedException
from ..utils import count_urgent_words, extract_emails_from_text from ..utils import count_urgent_words, extract_emails_from_text, create_session
from ...jobs import ( from ...jobs import (
JobPost, JobPost,
Compensation, Compensation,
@ -54,7 +53,7 @@ class IndeedScraper(Scraper):
self.country = scraper_input.country self.country = scraper_input.country
domain = self.country.domain_value domain = self.country.domain_value
self.url = f"https://{domain}.indeed.com" self.url = f"https://{domain}.indeed.com"
session = self.create_session() session = create_session(self.proxy)
params = { params = {
"q": scraper_input.search_term, "q": scraper_input.search_term,
@ -155,8 +154,11 @@ class IndeedScraper(Scraper):
compensation=compensation, compensation=compensation,
date_posted=date_posted, date_posted=date_posted,
job_url=job_url_client, job_url=job_url_client,
emails=extract_emails_from_text(description), emails=extract_emails_from_text(description) if description else None,
num_urgent_words=count_urgent_words(description) num_urgent_words=count_urgent_words(description)
if description
else None,
is_remote=self.is_remote_job(job),
) )
return job_post return job_post
@ -213,7 +215,7 @@ class IndeedScraper(Scraper):
params = urllib.parse.parse_qs(parsed_url.query) params = urllib.parse.parse_qs(parsed_url.query)
jk_value = params.get("jk", [None])[0] jk_value = params.get("jk", [None])[0]
formatted_url = f"{self.url}/viewjob?jk={jk_value}&spa=1" formatted_url = f"{self.url}/viewjob?jk={jk_value}&spa=1"
session = self.create_session() session = create_session(self.proxy)
try: try:
response = session.get( response = session.get(
@ -250,7 +252,9 @@ class IndeedScraper(Scraper):
label = taxonomy["attributes"][i].get("label") label = taxonomy["attributes"][i].get("label")
if label: if label:
job_type_str = label.replace("-", "").replace(" ", "").lower() job_type_str = label.replace("-", "").replace(" ", "").lower()
job_types.append(IndeedScraper.get_enum_from_job_type(job_type_str)) job_types.append(
IndeedScraper.get_enum_from_job_type(job_type_str)
)
return job_types return job_types
@staticmethod @staticmethod
@ -326,35 +330,26 @@ class IndeedScraper(Scraper):
@staticmethod @staticmethod
def get_headers(): def get_headers():
return { return {
'authority': 'www.indeed.com', "authority": "www.indeed.com",
'accept': '*/*', "accept": "*/*",
'accept-language': 'en-US,en;q=0.9', "accept-language": "en-US,en;q=0.9",
'referer': 'https://www.indeed.com/viewjob?jk=fe6182337d72c7b1&tk=1hcbfcmd0k62t802&from=serp&vjs=3&advn=8132938064490989&adid=408692607&ad=-6NYlbfkN0A3Osc99MJFDKjquSk4WOGT28ALb_ad4QMtrHreCb9ICg6MiSVy9oDAp3evvOrI7Q-O9qOtQTg1EPbthP9xWtBN2cOuVeHQijxHjHpJC65TjDtftH3AXeINjBvAyDrE8DrRaAXl8LD3Fs1e_xuDHQIssdZ2Mlzcav8m5jHrA0fA64ZaqJV77myldaNlM7-qyQpy4AsJQfvg9iR2MY7qeC5_FnjIgjKIy_lNi9OPMOjGRWXA94CuvC7zC6WeiJmBQCHISl8IOBxf7EdJZlYdtzgae3593TFxbkd6LUwbijAfjax39aAuuCXy3s9C4YgcEP3TwEFGQoTpYu9Pmle-Ae1tHGPgsjxwXkgMm7Cz5mBBdJioglRCj9pssn-1u1blHZM4uL1nK9p1Y6HoFgPUU9xvKQTHjKGdH8d4y4ETyCMoNF4hAIyUaysCKdJKitC8PXoYaWhDqFtSMR4Jys8UPqUV&xkcb=SoDD-_M3JLQfWnQTDh0LbzkdCdPP&xpse=SoBa6_I3JLW9FlWZlB0PbzkdCdPP&sjdu=i6xVERweJM_pVUvgf-MzuaunBTY7G71J5eEX6t4DrDs5EMPQdODrX7Nn-WIPMezoqr5wA_l7Of-3CtoiUawcHw', "referer": "https://www.indeed.com/viewjob?jk=fe6182337d72c7b1&tk=1hcbfcmd0k62t802&from=serp&vjs=3&advn=8132938064490989&adid=408692607&ad=-6NYlbfkN0A3Osc99MJFDKjquSk4WOGT28ALb_ad4QMtrHreCb9ICg6MiSVy9oDAp3evvOrI7Q-O9qOtQTg1EPbthP9xWtBN2cOuVeHQijxHjHpJC65TjDtftH3AXeINjBvAyDrE8DrRaAXl8LD3Fs1e_xuDHQIssdZ2Mlzcav8m5jHrA0fA64ZaqJV77myldaNlM7-qyQpy4AsJQfvg9iR2MY7qeC5_FnjIgjKIy_lNi9OPMOjGRWXA94CuvC7zC6WeiJmBQCHISl8IOBxf7EdJZlYdtzgae3593TFxbkd6LUwbijAfjax39aAuuCXy3s9C4YgcEP3TwEFGQoTpYu9Pmle-Ae1tHGPgsjxwXkgMm7Cz5mBBdJioglRCj9pssn-1u1blHZM4uL1nK9p1Y6HoFgPUU9xvKQTHjKGdH8d4y4ETyCMoNF4hAIyUaysCKdJKitC8PXoYaWhDqFtSMR4Jys8UPqUV&xkcb=SoDD-_M3JLQfWnQTDh0LbzkdCdPP&xpse=SoBa6_I3JLW9FlWZlB0PbzkdCdPP&sjdu=i6xVERweJM_pVUvgf-MzuaunBTY7G71J5eEX6t4DrDs5EMPQdODrX7Nn-WIPMezoqr5wA_l7Of-3CtoiUawcHw",
'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"', "sec-ch-ua": '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
'sec-ch-ua-mobile': '?0', "sec-ch-ua-mobile": "?0",
'sec-ch-ua-platform': '"Windows"', "sec-ch-ua-platform": '"Windows"',
'sec-fetch-dest': 'empty', "sec-fetch-dest": "empty",
'sec-fetch-mode': 'cors', "sec-fetch-mode": "cors",
'sec-fetch-site': 'same-origin', "sec-fetch-site": "same-origin",
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36' "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
} }
def create_session(self): @staticmethod
def is_remote_job(job: dict) -> bool:
""" """
Creates a session with specific client identifiers and assigns proxies if available. :param job:
:return: bool
:return: A session object with or without proxies.
""" """
session = tls_client.Session( for taxonomy in job.get("taxonomyAttributes", []):
client_identifier="chrome112", if taxonomy["label"] == "remote" and len(taxonomy["attributes"]) > 0:
random_tls_extension_order=True, return True
) return False
session.proxies = self.proxy
# TODO multiple proxies
# if self.proxies:
# session.proxies = {
# "http": random.choice(self.proxies),
# "https": random.choice(self.proxies),
# }
return session
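The per-scraper `create_session` above is replaced by a shared `create_session(proxy)` helper imported from `..utils`, whose implementation is not part of this diff. Judging from the removed method, it presumably looks something like the sketch below (an assumption, not the actual utils code):

```python
import tls_client


def create_session(proxy: str | None = None) -> tls_client.Session:
    """Assumed shape of the shared helper: a tls_client session with the proxy attached."""
    session = tls_client.Session(
        client_identifier="chrome112",
        random_tls_extension_order=True,
    )
    if proxy:
        session.proxies = proxy  # single proxy URL; multiple-proxy rotation is still a TODO
    return session
```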

View File

@ -93,13 +93,15 @@ class LinkedInScraper(Scraper):
break break
except requests.HTTPError as e: except requests.HTTPError as e:
if hasattr(e, 'response') and e.response is not None: if hasattr(e, "response") and e.response is not None:
if e.response.status_code == 429: if e.response.status_code == 429:
time.sleep(self.DELAY) time.sleep(self.DELAY)
retries += 1 retries += 1
continue continue
else: else:
raise LinkedInException(f"bad response status code: {e.response.status_code}") raise LinkedInException(
f"bad response status code: {e.response.status_code}"
)
else: else:
raise raise
except ProxyError as e: except ProxyError as e:
@ -108,7 +110,9 @@ class LinkedInScraper(Scraper):
raise LinkedInException(str(e)) raise LinkedInException(str(e))
else: else:
# Raise an exception if the maximum number of retries is reached # Raise an exception if the maximum number of retries is reached
raise LinkedInException("Max retries reached, failed to get a valid response") raise LinkedInException(
"Max retries reached, failed to get a valid response"
)
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
@ -135,7 +139,9 @@ class LinkedInScraper(Scraper):
if job_post: if job_post:
job_list.append(job_post) job_list.append(job_post)
except Exception as e: except Exception as e:
raise LinkedInException("Exception occurred while processing jobs") raise LinkedInException(
"Exception occurred while processing jobs"
)
page += 25 page += 25
job_list = job_list[: scraper_input.results_wanted] job_list = job_list[: scraper_input.results_wanted]
@ -152,7 +158,11 @@ class LinkedInScraper(Scraper):
metadata_card = job_card.find("div", class_="base-search-card__metadata") metadata_card = job_card.find("div", class_="base-search-card__metadata")
location = self.get_location(metadata_card) location = self.get_location(metadata_card)
datetime_tag = metadata_card.find("time", class_="job-search-card__listdate") if metadata_card else None datetime_tag = (
metadata_card.find("time", class_="job-search-card__listdate")
if metadata_card
else None
)
date_posted = None date_posted = None
if datetime_tag and "datetime" in datetime_tag.attrs: if datetime_tag and "datetime" in datetime_tag.attrs:
datetime_str = datetime_tag["datetime"] datetime_str = datetime_tag["datetime"]
@ -172,14 +182,16 @@ class LinkedInScraper(Scraper):
location=location, location=location,
date_posted=date_posted, date_posted=date_posted,
job_url=job_url, job_url=job_url,
# job_type=[JobType.FULL_TIME],
job_type=job_type, job_type=job_type,
benefits=benefits, benefits=benefits,
emails=extract_emails_from_text(description), emails=extract_emails_from_text(description) if description else None,
num_urgent_words=count_urgent_words(description) num_urgent_words=count_urgent_words(description) if description else None,
) )
def get_job_description(self, job_page_url: str) -> tuple[None, None] | tuple[ def get_job_description(
str | None, tuple[str | None, JobType | None]]: self, job_page_url: str
) -> tuple[None, None] | tuple[str | None, tuple[str | None, JobType | None]]:
""" """
Retrieves job description by going to the job page url Retrieves job description by going to the job page url
:param job_page_url: :param job_page_url:
@ -233,7 +245,7 @@ class LinkedInScraper(Scraper):
def get_enum_from_value(value_str): def get_enum_from_value(value_str):
for job_type in JobType: for job_type in JobType:
if value_str in job_type.value: if value_str in job_type.value:
return list[job_type] return [job_type]
return None return None
def get_location(self, metadata_card: Optional[Tag]) -> Location: def get_location(self, metadata_card: Optional[Tag]) -> Location:

View File

@ -11,7 +11,6 @@ from datetime import datetime, date
from typing import Optional, Tuple, Any from typing import Optional, Tuple, Any
from urllib.parse import urlparse, parse_qs, urlunparse from urllib.parse import urlparse, parse_qs, urlunparse
import tls_client
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import Tag from bs4.element import Tag
@ -19,7 +18,7 @@ from concurrent.futures import ThreadPoolExecutor, Future
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
from ..exceptions import ZipRecruiterException from ..exceptions import ZipRecruiterException
from ..utils import count_urgent_words, extract_emails_from_text from ..utils import count_urgent_words, extract_emails_from_text, create_session
from ...jobs import ( from ...jobs import (
JobPost, JobPost,
Compensation, Compensation,
@ -42,9 +41,6 @@ class ZipRecruiterScraper(Scraper):
self.jobs_per_page = 20 self.jobs_per_page = 20
self.seen_urls = set() self.seen_urls = set()
self.session = tls_client.Session(
client_identifier="chrome112", random_tls_extension_order=True
)
def find_jobs_in_page( def find_jobs_in_page(
self, scraper_input: ScraperInput, page: int self, scraper_input: ScraperInput, page: int
@ -55,14 +51,13 @@ class ZipRecruiterScraper(Scraper):
:param page: :param page:
:return: jobs found on page :return: jobs found on page
""" """
job_list: list[JobPost] = [] session = create_session(self.proxy)
try: try:
response = self.session.get( response = session.get(
f"{self.url}/jobs-search", f"{self.url}/jobs-search",
headers=ZipRecruiterScraper.headers(), headers=self.headers(),
params=ZipRecruiterScraper.add_params(scraper_input, page), params=self.add_params(scraper_input, page),
allow_redirects=True, allow_redirects=True,
proxy=self.proxy,
timeout_seconds=10, timeout_seconds=10,
) )
if response.status_code != 200: if response.status_code != 200:
@ -116,7 +111,11 @@ class ZipRecruiterScraper(Scraper):
:param scraper_input: :param scraper_input:
:return: job_response :return: job_response
""" """
start_page = (scraper_input.offset // self.jobs_per_page) + 1 if scraper_input.offset else 1 start_page = (
(scraper_input.offset // self.jobs_per_page) + 1
if scraper_input.offset
else 1
)
#: get first page to initialize session #: get first page to initialize session
job_list: list[JobPost] = self.find_jobs_in_page(scraper_input, start_page) job_list: list[JobPost] = self.find_jobs_in_page(scraper_input, start_page)
pages_to_process = max( pages_to_process = max(
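As a worked example of the `start_page` arithmetic above (using the scraper's `jobs_per_page = 20` shown earlier and the `offset=25` from the README example):

```python
jobs_per_page = 20
offset = 25

start_page = (offset // jobs_per_page) + 1 if offset else 1
print(start_page)  # (25 // 20) + 1 == 2, so scraping resumes on page 2
```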
@ -137,92 +136,10 @@ class ZipRecruiterScraper(Scraper):
job_list = job_list[: scraper_input.results_wanted] job_list = job_list[: scraper_input.results_wanted]
return JobResponse(jobs=job_list) return JobResponse(jobs=job_list)
def process_job_html_1(self, job: Tag) -> Optional[JobPost]:
"""
Parses a job from the job content tag
:param job: BeautifulSoup Tag for one job post
:return JobPost
TODO this method isn't finished due to not encountering this type of html often
"""
job_url = self.cleanurl(job.find("a", {"class": "job_link"})["href"])
if job_url in self.seen_urls:
return None
title = job.find("h2", {"class": "title"}).text
company = job.find("a", {"class": "company_name"}).text.strip()
description, updated_job_url = self.get_description(job_url)
# job_url = updated_job_url if updated_job_url else job_url
if description is None:
description = job.find("p", {"class": "job_snippet"}).text.strip()
job_type_element = job.find("li", {"class": "perk_item perk_type"})
job_type = None
if job_type_element:
job_type_text = (
job_type_element.text.strip().lower().replace("_", "").replace(" ", "")
)
job_type = ZipRecruiterScraper.get_job_type_enum(job_type_text)
date_posted = ZipRecruiterScraper.get_date_posted(job)
job_post = JobPost(
title=title,
description=description,
company_name=company,
location=ZipRecruiterScraper.get_location(job),
job_type=job_type,
compensation=ZipRecruiterScraper.get_compensation(job),
date_posted=date_posted,
job_url=job_url,
emails=extract_emails_from_text(description),
num_urgent_words=count_urgent_words(description)
)
return job_post
def process_job_html_2(self, job: Tag) -> Optional[JobPost]:
"""
Parses a job from the job content tag for a second variant of HTML that ZR uses
:param job: BeautifulSoup Tag for one job post
:return JobPost
"""
job_url = self.cleanurl(job.find("a", class_="job_link")["href"])
title = job.find("h2", class_="title").text
company = job.find("a", class_="company_name").text.strip()
description, updated_job_url = self.get_description(job_url)
# job_url = updated_job_url if updated_job_url else job_url
if description is None:
description = job.find("p", class_="job_snippet").get_text().strip()
job_type_text = job.find("li", class_="perk_item perk_type")
job_type = None
if job_type_text:
job_type_text = (
job_type_text.get_text()
.strip()
.lower()
.replace("-", "")
.replace(" ", "")
)
job_type = ZipRecruiterScraper.get_job_type_enum(job_type_text)
date_posted = ZipRecruiterScraper.get_date_posted(job)
job_post = JobPost(
title=title,
description=description,
company_name=company,
location=ZipRecruiterScraper.get_location(job),
job_type=job_type,
compensation=ZipRecruiterScraper.get_compensation(job),
date_posted=date_posted,
job_url=job_url,
)
return job_post
def process_job_javascript(self, job: dict) -> JobPost: def process_job_javascript(self, job: dict) -> JobPost:
"""the most common type of jobs page on ZR"""
title = job.get("Title") title = job.get("Title")
job_url = self.cleanurl(job.get("JobURL")) job_url = job.get("JobURL")
description, updated_job_url = self.get_description(job_url) description, updated_job_url = self.get_description(job_url)
# job_url = updated_job_url if updated_job_url else job_url # job_url = updated_job_url if updated_job_url else job_url
@ -276,37 +193,126 @@ class ZipRecruiterScraper(Scraper):
return JobPost( return JobPost(
title=title, title=title,
description=description,
company_name=company, company_name=company,
location=location, location=location,
job_type=job_type, job_type=job_type,
compensation=compensation, compensation=compensation,
date_posted=date_posted, date_posted=date_posted,
job_url=job_url, job_url=job_url,
description=description,
emails=extract_emails_from_text(description) if description else None,
num_urgent_words=count_urgent_words(description) if description else None,
)
def process_job_html_2(self, job: Tag) -> Optional[JobPost]:
"""
second most common type of jobs page on ZR after process_job_javascript()
Parses a job from the job content tag for a second variant of HTML that ZR uses Parses a job from the job content tag for a second variant of HTML that ZR uses
:param job: BeautifulSoup Tag for one job post
:return JobPost
"""
job_url = job.find("a", class_="job_link")["href"]
title = job.find("h2", class_="title").text
company = job.find("a", class_="company_name").text.strip()
description, updated_job_url = self.get_description(job_url)
# job_url = updated_job_url if updated_job_url else job_url
if description is None:
description = job.find("p", class_="job_snippet").get_text().strip()
job_type_text = job.find("li", class_="perk_item perk_type")
job_type = None
if job_type_text:
job_type_text = (
job_type_text.get_text()
.strip()
.lower()
.replace("-", "")
.replace(" ", "")
)
job_type = ZipRecruiterScraper.get_job_type_enum(job_type_text)
date_posted = ZipRecruiterScraper.get_date_posted(job)
job_post = JobPost(
title=title,
company_name=company,
location=ZipRecruiterScraper.get_location(job),
job_type=job_type,
compensation=ZipRecruiterScraper.get_compensation(job),
date_posted=date_posted,
job_url=job_url,
description=description,
emails=extract_emails_from_text(description) if description else None,
num_urgent_words=count_urgent_words(description) if description else None,
)
return job_post
def process_job_html_1(self, job: Tag) -> Optional[JobPost]:
"""
TODO this method isn't finished due to not encountering this type of html often
least common type of jobs page on ZR (rarely found)
Parses a job from the job content tag
:param job: BeautifulSoup Tag for one job post
:return JobPost
"""
job_url = job.find("a", {"class": "job_link"})["href"]
# job_url = self.cleanurl(job.find("a", {"class": "job_link"})["href"])
if job_url in self.seen_urls:
return None
title = job.find("h2", {"class": "title"}).text
company = job.find("a", {"class": "company_name"}).text.strip()
description, _ = self.get_description(job_url)
# job_url = updated_job_url if updated_job_url else job_url
# get description from jobs listing page if get_description from the specific job page fails
if description is None:
description = job.find("p", {"class": "job_snippet"}).text.strip()
job_type_element = job.find("li", {"class": "perk_item perk_type"})
job_type = None
if job_type_element:
job_type_text = (
job_type_element.text.strip().lower().replace("_", "").replace(" ", "")
)
job_type = ZipRecruiterScraper.get_job_type_enum(job_type_text)
date_posted = ZipRecruiterScraper.get_date_posted(job)
job_post = JobPost(
title=title,
description=description,
company_name=company,
location=ZipRecruiterScraper.get_location(job),
job_type=job_type,
compensation=ZipRecruiterScraper.get_compensation(job),
date_posted=date_posted,
job_url=job_url,
emails=extract_emails_from_text(description),
num_urgent_words=count_urgent_words(description),
) )
return job_post return job_post
@staticmethod @staticmethod
def get_job_type_enum(job_type_str: str) -> Optional[list[JobType]]: def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
for job_type in JobType: for job_type in JobType:
if job_type_str in job_type.value: if job_type_str in job_type.value:
return [job_type] return [job_type]
return None return None
def get_description(self, job_page_url: str) -> Tuple[Optional[str], Optional[str]]: def get_description(self, job_page_url: str) -> Tuple[str | None, str | None]:
""" """
Retrieves job description by going to the job page url Retrieves job description by going to the job page url
:param job_page_url: :param job_page_url:
:param session:
:return: description or None, response url :return: description or None, response url
""" """
try: try:
response = requests.get( session = create_session(self.proxy)
response = session.get(
job_page_url, job_page_url,
headers=ZipRecruiterScraper.headers(), headers=self.headers(),
allow_redirects=True, allow_redirects=True,
timeout=5, timeout_seconds=5,
proxies=self.proxy,
) )
if response.status_code not in range(200, 400): if response.status_code not in range(200, 400):
return None, None return None, None
@ -462,8 +468,8 @@ class ZipRecruiterScraper(Scraper):
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36" "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
} }
@staticmethod # @staticmethod
def cleanurl(url): # def cleanurl(url) -> str:
parsed_url = urlparse(url) # parsed_url = urlparse(url)
#
return urlunparse((parsed_url.scheme, parsed_url.netloc, parsed_url.path, parsed_url.params, '', '')) # return urlunparse((parsed_url.scheme, parsed_url.netloc, parsed_url.path, parsed_url.params, '', ''))

View File

@ -9,4 +9,6 @@ def test_all():
results_wanted=5, results_wanted=5,
) )
assert isinstance(result, pd.DataFrame) and not result.empty, "Result should be a non-empty DataFrame" assert (
isinstance(result, pd.DataFrame) and not result.empty
), "Result should be a non-empty DataFrame"

View File

@ -7,4 +7,6 @@ def test_indeed():
site_name="indeed", site_name="indeed",
search_term="software engineer", search_term="software engineer",
) )
assert isinstance(result, pd.DataFrame) and not result.empty, "Result should be a non-empty DataFrame" assert (
isinstance(result, pd.DataFrame) and not result.empty
), "Result should be a non-empty DataFrame"

View File

@ -7,4 +7,6 @@ def test_linkedin():
site_name="linkedin", site_name="linkedin",
search_term="software engineer", search_term="software engineer",
) )
assert isinstance(result, pd.DataFrame) and not result.empty, "Result should be a non-empty DataFrame" assert (
isinstance(result, pd.DataFrame) and not result.empty
), "Result should be a non-empty DataFrame"

View File

@ -8,4 +8,6 @@ def test_ziprecruiter():
search_term="software engineer", search_term="software engineer",
) )
assert isinstance(result, pd.DataFrame) and not result.empty, "Result should be a non-empty DataFrame" assert (
isinstance(result, pd.DataFrame) and not result.empty
), "Result should be a non-empty DataFrame"