add offset param & email extraction (#51)

* add offset param

* [enh]: extract emails
pull/54/head v1.1.8
Cullen Watson 2023-09-28 18:11:28 -05:00 committed by GitHub
parent 286b9e1256
commit af07c1ecbd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 1209 additions and 1126 deletions

8
.gitignore vendored
View File

@ -1,10 +1,10 @@
/.idea
**/.DS_Store
/venv/ /venv/
/ven/ /.idea
**/__pycache__/ **/__pycache__/
**/.pytest_cache/ **/.pytest_cache/
/.ipynb_checkpoints/
**/output/
**/.DS_Store
*.pyc *.pyc
.env .env
dist dist
/.ipynb_checkpoints/

View File

@ -4,21 +4,25 @@
**Not technical?** Try out the web scraping tool on our site at [usejobspy.com](https://usejobspy.com). **Not technical?** Try out the web scraping tool on our site at [usejobspy.com](https://usejobspy.com).
*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/zachary-products/15min)** *to work with us.* *Looking to build a data-focused software product?* **[Book a call](https://calendly.com/zachary-products/15min)** *to
work with us.*
\ \
Check out another project we wrote: ***[HomeHarvest](https://github.com/ZacharyHampton/HomeHarvest)** a Python package for real estate scraping* Check out another project we wrote: ***[HomeHarvest](https://github.com/ZacharyHampton/HomeHarvest)** a Python package
## Features for real estate scraping*
## Features
- Scrapes job postings from **LinkedIn**, **Indeed** & **ZipRecruiter** simultaneously - Scrapes job postings from **LinkedIn**, **Indeed** & **ZipRecruiter** simultaneously
- Aggregates the job postings in a Pandas DataFrame - Aggregates the job postings in a Pandas DataFrame
- Proxy support (HTTP/S, SOCKS) - Proxy support (HTTP/S, SOCKS)
[Video Guide for JobSpy](https://www.youtube.com/watch?v=RuP1HrAZnxs&pp=ygUgam9icyBzY3JhcGVyIGJvdCBsaW5rZWRpbiBpbmRlZWQ%3D) - Updated for release v1.1.3 [Video Guide for JobSpy](https://www.youtube.com/watch?v=RuP1HrAZnxs&pp=ygUgam9icyBzY3JhcGVyIGJvdCBsaW5rZWRpbiBpbmRlZWQ%3D) -
Updated for release v1.1.3
![jobspy](https://github.com/cullenwatson/JobSpy/assets/78247585/ec7ef355-05f6-4fd3-8161-a817e31c5c57) ![jobspy](https://github.com/cullenwatson/JobSpy/assets/78247585/ec7ef355-05f6-4fd3-8161-a817e31c5c57)
### Installation ### Installation
``` ```
pip install --upgrade python-jobspy pip install --upgrade python-jobspy
``` ```
@ -39,10 +43,9 @@ jobs: pd.DataFrame = scrape_jobs(
country_indeed='USA' # only needed for indeed country_indeed='USA' # only needed for indeed
# use if you want to use a proxy (3 types) # use if you want to use a proxy
# proxy="socks5://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
# proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001", # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
# proxy="https://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001", # offset=25 # use if you want to start at a specific offset
) )
# formatting for pandas # formatting for pandas
@ -51,17 +54,22 @@ pd.set_option('display.max_rows', None)
pd.set_option('display.width', None) pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc
#1 display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook) # 1 output to console
display(jobs) print(jobs)
#2 output to console # 2 display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook)
#print(jobs) # display(jobs)
# 3 output to .csv # 3 output to .csv
# jobs.to_csv('jobs.csv', index=False) # jobs.to_csv('jobs.csv', index=False)
# 4 output to .xlsx
# jobs.to_excel('jobs.xlsx', index=False)
``` ```
### Output ### Output
``` ```
SITE TITLE COMPANY_NAME CITY STATE JOB_TYPE INTERVAL MIN_AMOUNT MAX_AMOUNT JOB_URL DESCRIPTION SITE TITLE COMPANY_NAME CITY STATE JOB_TYPE INTERVAL MIN_AMOUNT MAX_AMOUNT JOB_URL DESCRIPTION
indeed Software Engineer AMERICAN SYSTEMS Arlington VA None yearly 200000 150000 https://www.indeed.com/viewjob?jk=5e409e577046... THIS POSITION COMES WITH A 10K SIGNING BONUS!... indeed Software Engineer AMERICAN SYSTEMS Arlington VA None yearly 200000 150000 https://www.indeed.com/viewjob?jk=5e409e577046... THIS POSITION COMES WITH A 10K SIGNING BONUS!...
@ -71,7 +79,9 @@ linkedin Full-Stack Software Engineer Rain New York
zip_recruiter Software Engineer - New Grad ZipRecruiter Santa Monica CA fulltime yearly 130000 150000 https://www.ziprecruiter.com/jobs/ziprecruiter... We offer a hybrid work environment. Most US-ba... zip_recruiter Software Engineer - New Grad ZipRecruiter Santa Monica CA fulltime yearly 130000 150000 https://www.ziprecruiter.com/jobs/ziprecruiter... We offer a hybrid work environment. Most US-ba...
zip_recruiter Software Developer TEKsystems Phoenix AZ fulltime hourly 65 75 https://www.ziprecruiter.com/jobs/teksystems-0... Top Skills' Details• 6 years of Java developme... zip_recruiter Software Developer TEKsystems Phoenix AZ fulltime hourly 65 75 https://www.ziprecruiter.com/jobs/teksystems-0... Top Skills' Details• 6 years of Java developme...
``` ```
### Parameters for `scrape_jobs()` ### Parameters for `scrape_jobs()`
```plaintext ```plaintext
Required Required
├── site_type (List[enum]): linkedin, zip_recruiter, indeed ├── site_type (List[enum]): linkedin, zip_recruiter, indeed
@ -85,10 +95,11 @@ Optional
├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type' ├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type'
├── easy_apply (bool): filters for jobs that are hosted on LinkedIn ├── easy_apply (bool): filters for jobs that are hosted on LinkedIn
├── country_indeed (enum): filters the country on Indeed (see below for correct spelling) ├── country_indeed (enum): filters the country on Indeed (see below for correct spelling)
├── offset (int): starts the search from an offset (e.g. 25 will start the search from the 25th result)
``` ```
### JobPost Schema ### JobPost Schema
```plaintext ```plaintext
JobPost JobPost
├── title (str) ├── title (str)
@ -109,14 +120,15 @@ JobPost
``` ```
### Exceptions ### Exceptions
The following exceptions may be raised when using JobSpy: The following exceptions may be raised when using JobSpy:
* `LinkedInException` * `LinkedInException`
* `IndeedException` * `IndeedException`
* `ZipRecruiterException` * `ZipRecruiterException`
## Supported Countries for Job Searching ## Supported Countries for Job Searching
### **LinkedIn** ### **LinkedIn**
LinkedIn searches globally & uses only the `location` parameter. LinkedIn searches globally & uses only the `location` parameter.
@ -125,15 +137,15 @@ LinkedIn searches globally & uses only the `location` parameter.
ZipRecruiter searches for jobs in **US/Canada** & uses only the `location` parameter. ZipRecruiter searches for jobs in **US/Canada** & uses only the `location` parameter.
### **Indeed** ### **Indeed**
Indeed supports most countries, but the `country_indeed` parameter is required. Additionally, use the `location` parameter to narrow down the location, e.g. city & state if necessary.
Indeed supports most countries, but the `country_indeed` parameter is required. Additionally, use the `location`
parameter to narrow down the location, e.g. city & state if necessary.
You can specify the following countries when searching on Indeed (use the exact name): You can specify the following countries when searching on Indeed (use the exact name):
| | | | | | | | | |
|------|------|------|------| |----------------------|--------------|------------|----------------|
| Argentina | Australia | Austria | Bahrain | | Argentina | Australia | Austria | Bahrain |
| Belgium | Brazil | Canada | Chile | | Belgium | Brazil | Canada | Chile |
| China | Colombia | Costa Rica | Czech Republic | | China | Colombia | Costa Rica | Czech Republic |
@ -156,12 +168,14 @@ You can specify the following countries when searching on Indeed (use the exact
--- ---
**Q: Encountering issues with your queries?** **Q: Encountering issues with your queries?**
**A:** Try reducing the number of `results_wanted` and/or broadening the filters. If problems persist, [submit an issue](https://github.com/cullenwatson/JobSpy/issues). **A:** Try reducing the number of `results_wanted` and/or broadening the filters. If problems
persist, [submit an issue](https://github.com/cullenwatson/JobSpy/issues).
--- ---
**Q: Received a response code 429?** **Q: Received a response code 429?**
**A:** This indicates that you have been blocked by the job board site for sending too many requests. Currently, **LinkedIn** is particularly aggressive with blocking. We recommend: **A:** This indicates that you have been blocked by the job board site for sending too many requests. Currently,
**LinkedIn** is particularly aggressive with blocking. We recommend:
- Waiting a few seconds between requests. - Waiting a few seconds between requests.
- Trying a VPN or proxy to change your IP address. - Trying a VPN or proxy to change your IP address.
@ -170,6 +184,7 @@ You can specify the following countries when searching on Indeed (use the exact
**Q: Experiencing a "Segmentation fault: 11" on macOS Catalina?** **Q: Experiencing a "Segmentation fault: 11" on macOS Catalina?**
**A:** This is due to `tls_client` dependency not supporting your architecture. Solutions and workarounds include: **A:** This is due to `tls_client` dependency not supporting your architecture. Solutions and workarounds include:
- Upgrade to a newer version of MacOS - Upgrade to a newer version of MacOS
- Reach out to the maintainers of [tls_client](https://github.com/bogdanfinn/tls-client) for fixes - Reach out to the maintainers of [tls_client](https://github.com/bogdanfinn/tls-client) for fixes

31
examples/JobSpy_Demo.py Normal file
View File

@ -0,0 +1,31 @@
"""Demo script: scrape job postings with JobSpy and export the results.

Calls ``scrape_jobs`` for Indeed, LinkedIn and ZipRecruiter, prints the
resulting DataFrame and writes it to ``./jobs.csv``.
"""
from jobspy import scrape_jobs
import pandas as pd

jobs: pd.DataFrame = scrape_jobs(
    site_name=["indeed", "linkedin", "zip_recruiter"],
    search_term="software engineer",
    location="Dallas, TX",
    # be wary: the higher it is, the more likely you'll get blocked
    # (a rotating proxy should work though)
    results_wanted=50,
    country_indeed='USA',
    # start jobs from an offset (use if a search failed and you want to continue)
    offset=25,
    # every argument ends with a comma so the optional line below can be
    # uncommented without causing a syntax error
    # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
)

# formatting for pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)  # set to 0 to see full job url / desc

# 1: output to console
print(jobs)

# 2: output to .csv
jobs.to_csv('./jobs.csv', index=False)
print('outputted to jobs.csv')

# 3: output to .xlsx (DataFrame has no `to_xlsx`; the pandas method is `to_excel`)
# jobs.to_excel('jobs.xlsx', index=False)

# 4: display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook)
# display(jobs)

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "python-jobspy" name = "python-jobspy"
version = "1.1.7" version = "1.1.8"
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter" description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"] authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
homepage = "https://github.com/cullenwatson/JobSpy" homepage = "https://github.com/cullenwatson/JobSpy"

View File

@ -1,8 +1,7 @@
import pandas as pd import pandas as pd
import concurrent.futures import concurrent.futures
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple, NamedTuple, Dict, Optional from typing import List, Tuple, Optional
import traceback
from .jobs import JobType, Location from .jobs import JobType, Location
from .scrapers.indeed import IndeedScraper from .scrapers.indeed import IndeedScraper
@ -38,6 +37,7 @@ def scrape_jobs(
country_indeed: str = "usa", country_indeed: str = "usa",
hyperlinks: bool = False, hyperlinks: bool = False,
proxy: Optional[str] = None, proxy: Optional[str] = None,
offset: Optional[int] = 0
) -> pd.DataFrame: ) -> pd.DataFrame:
""" """
Simultaneously scrapes job data from multiple job sites. Simultaneously scrapes job data from multiple job sites.
@ -49,8 +49,8 @@ def scrape_jobs(
if value_str in job_type.value: if value_str in job_type.value:
return job_type return job_type
raise Exception(f"Invalid job type: {value_str}") raise Exception(f"Invalid job type: {value_str}")
job_type = get_enum_from_value(job_type) if job_type else None
job_type = get_enum_from_value(job_type) if job_type else None
if type(site_name) == str: if type(site_name) == str:
site_type = [_map_str_to_site(site_name)] site_type = [_map_str_to_site(site_name)]
@ -72,6 +72,7 @@ def scrape_jobs(
job_type=job_type, job_type=job_type,
easy_apply=easy_apply, easy_apply=easy_apply,
results_wanted=results_wanted, results_wanted=results_wanted,
offset=offset
) )
def scrape_site(site: Site) -> Tuple[str, JobResponse]: def scrape_site(site: Site) -> Tuple[str, JobResponse]:
@ -149,17 +150,19 @@ def scrape_jobs(
if jobs_dfs: if jobs_dfs:
jobs_df = pd.concat(jobs_dfs, ignore_index=True) jobs_df = pd.concat(jobs_dfs, ignore_index=True)
desired_order: List[str] = [ desired_order: List[str] = [
"job_url_hyper" if hyperlinks else "job_url",
"site", "site",
"title", "title",
"company", "company",
"location", "location",
"date_posted",
"job_type", "job_type",
"date_posted",
"interval", "interval",
"benefits",
"min_amount", "min_amount",
"max_amount", "max_amount",
"currency", "currency",
"job_url_hyper" if hyperlinks else "job_url", "emails",
"description", "description",
] ]
jobs_formatted_df = jobs_df[desired_order] jobs_formatted_df = jobs_df[desired_order]

View File

@ -170,7 +170,7 @@ class CompensationInterval(Enum):
class Compensation(BaseModel): class Compensation(BaseModel):
interval: CompensationInterval interval: Optional[CompensationInterval] = None
min_amount: int = None min_amount: int = None
max_amount: int = None max_amount: int = None
currency: Optional[str] = "USD" currency: Optional[str] = "USD"
@ -186,6 +186,8 @@ class JobPost(BaseModel):
job_type: Optional[JobType] = None job_type: Optional[JobType] = None
compensation: Optional[Compensation] = None compensation: Optional[Compensation] = None
date_posted: Optional[date] = None date_posted: Optional[date] = None
benefits: Optional[str] = None
emails: Optional[list[str]] = None
class JobResponse(BaseModel): class JobResponse(BaseModel):

View File

@ -18,6 +18,7 @@ class ScraperInput(BaseModel):
is_remote: bool = False is_remote: bool = False
job_type: Optional[JobType] = None job_type: Optional[JobType] = None
easy_apply: bool = None # linkedin easy_apply: bool = None # linkedin
offset: int = 0
results_wanted: int = 15 results_wanted: int = 15

View File

@ -8,7 +8,6 @@ import re
import math import math
import io import io
import json import json
import traceback
from datetime import datetime from datetime import datetime
from typing import Optional from typing import Optional
@ -27,7 +26,8 @@ from ...jobs import (
JobResponse, JobResponse,
JobType, JobType,
) )
from .. import Scraper, ScraperInput, Site, Country from .. import Scraper, ScraperInput, Site
from ...utils import extract_emails_from_text
class IndeedScraper(Scraper): class IndeedScraper(Scraper):
@ -35,6 +35,8 @@ class IndeedScraper(Scraper):
""" """
Initializes IndeedScraper with the Indeed job search url Initializes IndeedScraper with the Indeed job search url
""" """
self.url = None
self.country = None
site = Site(Site.INDEED) site = Site(Site.INDEED)
super().__init__(site, proxy=proxy) super().__init__(site, proxy=proxy)
@ -61,7 +63,7 @@ class IndeedScraper(Scraper):
"q": scraper_input.search_term, "q": scraper_input.search_term,
"l": scraper_input.location, "l": scraper_input.location,
"filter": 0, "filter": 0,
"start": 0 + page * 10, "start": scraper_input.offset + page * 10,
} }
if scraper_input.distance: if scraper_input.distance:
params["radius"] = scraper_input.distance params["radius"] = scraper_input.distance
@ -76,7 +78,7 @@ class IndeedScraper(Scraper):
params["sc"] = "0kf:" + "".join(sc_values) + ";" params["sc"] = "0kf:" + "".join(sc_values) + ";"
try: try:
response = session.get( response = session.get(
self.url + "/jobs", f"{self.url}/jobs",
params=params, params=params,
allow_redirects=True, allow_redirects=True,
proxy=self.proxy, proxy=self.proxy,
@ -137,9 +139,10 @@ class IndeedScraper(Scraper):
date_posted = date_posted.strftime("%Y-%m-%d") date_posted = date_posted.strftime("%Y-%m-%d")
description = self.get_description(job_url, session) description = self.get_description(job_url, session)
emails = extract_emails_from_text(description)
with io.StringIO(job["snippet"]) as f: with io.StringIO(job["snippet"]) as f:
soup = BeautifulSoup(f, "html.parser") soup_io = BeautifulSoup(f, "html.parser")
li_elements = soup.find_all("li") li_elements = soup_io.find_all("li")
if description is None and li_elements: if description is None and li_elements:
description = " ".join(li.text for li in li_elements) description = " ".join(li.text for li in li_elements)
@ -152,6 +155,7 @@ class IndeedScraper(Scraper):
state=job.get("jobLocationState"), state=job.get("jobLocationState"),
country=self.country, country=self.country,
), ),
emails=extract_emails_from_text(description),
job_type=job_type, job_type=job_type,
compensation=compensation, compensation=compensation,
date_posted=date_posted, date_posted=date_posted,
@ -206,7 +210,7 @@ class IndeedScraper(Scraper):
) )
return job_response return job_response
def get_description(self, job_page_url: str, session: tls_client.Session) -> str: def get_description(self, job_page_url: str, session: tls_client.Session) -> Optional[str]:
""" """
Retrieves job description by going to the job page url Retrieves job description by going to the job page url
:param job_page_url: :param job_page_url:
@ -249,13 +253,17 @@ class IndeedScraper(Scraper):
label = taxonomy["attributes"][0].get("label") label = taxonomy["attributes"][0].get("label")
if label: if label:
job_type_str = label.replace("-", "").replace(" ", "").lower() job_type_str = label.replace("-", "").replace(" ", "").lower()
return IndeedScraper.get_enum_from_value(job_type_str) return IndeedScraper.get_enum_from_job_type(job_type_str)
return None return None
@staticmethod @staticmethod
def get_enum_from_value(value_str): def get_enum_from_job_type(job_type_str):
"""
Given a string, returns the corresponding JobType enum member if a match is found.
for job_type in JobType: for job_type in JobType:
if value_str in job_type.value: """
for job_type in JobType:
if job_type_str in job_type.value:
return job_type return job_type
return None return None

View File

@ -4,14 +4,16 @@ jobspy.scrapers.linkedin
This module contains routines to scrape LinkedIn. This module contains routines to scrape LinkedIn.
""" """
from typing import Optional, Tuple from typing import Optional
from datetime import datetime from datetime import datetime
import traceback
import requests import requests
from requests.exceptions import Timeout, ProxyError import time
from requests.exceptions import ProxyError
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import Tag from bs4.element import Tag
from threading import Lock
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
from ..exceptions import LinkedInException from ..exceptions import LinkedInException
@ -20,17 +22,20 @@ from ...jobs import (
Location, Location,
JobResponse, JobResponse,
JobType, JobType,
Compensation,
CompensationInterval,
) )
from ...utils import extract_emails_from_text
class LinkedInScraper(Scraper): class LinkedInScraper(Scraper):
MAX_RETRIES = 3
DELAY = 10
def __init__(self, proxy: Optional[str] = None): def __init__(self, proxy: Optional[str] = None):
""" """
Initializes LinkedInScraper with the LinkedIn job search url Initializes LinkedInScraper with the LinkedIn job search url
""" """
site = Site(Site.LINKEDIN) site = Site(Site.LINKEDIN)
self.country = "worldwide"
self.url = "https://www.linkedin.com" self.url = "https://www.linkedin.com"
super().__init__(site, proxy=proxy) super().__init__(site, proxy=proxy)
@ -40,12 +45,12 @@ class LinkedInScraper(Scraper):
:param scraper_input: :param scraper_input:
:return: job_response :return: job_response
""" """
self.country = "worldwide"
job_list: list[JobPost] = [] job_list: list[JobPost] = []
seen_urls = set() seen_urls = set()
page, processed_jobs, job_count = 0, 0, 0 url_lock = Lock()
page = scraper_input.offset // 25 + 25 if scraper_input.offset else 0
def job_type_code(job_type): def job_type_code(job_type_enum):
mapping = { mapping = {
JobType.FULL_TIME: "F", JobType.FULL_TIME: "F",
JobType.PART_TIME: "P", JobType.PART_TIME: "P",
@ -54,10 +59,9 @@ class LinkedInScraper(Scraper):
JobType.TEMPORARY: "T", JobType.TEMPORARY: "T",
} }
return mapping.get(job_type, "") return mapping.get(job_type_enum, "")
with requests.Session() as session: while len(job_list) < scraper_input.results_wanted and page < 1000:
while len(job_list) < scraper_input.results_wanted:
params = { params = {
"keywords": scraper_input.search_term, "keywords": scraper_input.search_term,
"location": scraper_input.location, "location": scraper_input.location,
@ -66,78 +70,101 @@ class LinkedInScraper(Scraper):
"f_JT": job_type_code(scraper_input.job_type) "f_JT": job_type_code(scraper_input.job_type)
if scraper_input.job_type if scraper_input.job_type
else None, else None,
"pageNum": page, "pageNum": 0,
page: page + scraper_input.offset,
"f_AL": "true" if scraper_input.easy_apply else None, "f_AL": "true" if scraper_input.easy_apply else None,
} }
params = {k: v for k, v in params.items() if v is not None} params = {k: v for k, v in params.items() if v is not None}
params = {k: v for k, v in params.items() if v is not None}
retries = 0
while retries < self.MAX_RETRIES:
try: try:
response = session.get( response = requests.get(
f"{self.url}/jobs/search", f"{self.url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
params=params, params=params,
allow_redirects=True, allow_redirects=True,
proxies=self.proxy, proxies=self.proxy,
timeout=10, timeout=10,
) )
response.raise_for_status() response.raise_for_status()
break
except requests.HTTPError as e: except requests.HTTPError as e:
raise LinkedInException( if hasattr(e, 'response') and e.response is not None:
f"bad response status code: {response.status_code}" if e.response.status_code == 429:
) time.sleep(self.DELAY)
retries += 1
continue
else:
raise LinkedInException(f"bad response status code: {e.response.status_code}")
else:
raise
except ProxyError as e: except ProxyError as e:
raise LinkedInException("bad proxy") raise LinkedInException("bad proxy")
except (ProxyError, Exception) as e: except Exception as e:
raise LinkedInException(str(e)) raise LinkedInException(str(e))
else:
# Raise an exception if the maximum number of retries is reached
raise LinkedInException("Max retries reached, failed to get a valid response")
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
if page == 0: with ThreadPoolExecutor(max_workers=5) as executor:
job_count_text = soup.find( futures = []
"span", class_="results-context-header__job-count" for job_card in soup.find_all("div", class_="base-search-card"):
).text job_url = None
job_count = int("".join(filter(str.isdigit, job_count_text))) href_tag = job_card.find("a", class_="base-card__full-link")
if href_tag and "href" in href_tag.attrs:
for job_card in soup.find_all( href = href_tag.attrs["href"].split("?")[0]
"div", job_id = href.split("-")[-1]
class_="base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card",
):
processed_jobs += 1
data_entity_urn = job_card.get("data-entity-urn", "")
job_id = (
data_entity_urn.split(":")[-1] if data_entity_urn else "N/A"
)
job_url = f"{self.url}/jobs/view/{job_id}" job_url = f"{self.url}/jobs/view/{job_id}"
with url_lock:
if job_url in seen_urls: if job_url in seen_urls:
continue continue
seen_urls.add(job_url) seen_urls.add(job_url)
job_info = job_card.find("div", class_="base-search-card__info")
if job_info is None:
continue
title_tag = job_info.find("h3", class_="base-search-card__title")
title = title_tag.text.strip() if title_tag else "N/A"
company_tag = job_info.find("a", class_="hidden-nested-link") futures.append(executor.submit(self.process_job, job_card, job_url))
company = company_tag.text.strip() if company_tag else "N/A"
metadata_card = job_info.find( for future in as_completed(futures):
"div", class_="base-search-card__metadata" try:
) job_post = future.result()
location: Location = self.get_location(metadata_card) if job_post:
job_list.append(job_post)
except Exception as e:
raise LinkedInException("Exception occurred while processing jobs")
page += 25
datetime_tag = metadata_card.find( job_list = job_list[: scraper_input.results_wanted]
"time", class_="job-search-card__listdate" return JobResponse(jobs=job_list)
)
description, job_type = self.get_description(job_url) def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]:
if datetime_tag: title_tag = job_card.find("span", class_="sr-only")
title = title_tag.get_text(strip=True) if title_tag else "N/A"
company_tag = job_card.find("h4", class_="base-search-card__subtitle")
company_a_tag = company_tag.find("a") if company_tag else None
company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A"
metadata_card = job_card.find("div", class_="base-search-card__metadata")
location = self.get_location(metadata_card)
datetime_tag = metadata_card.find("time", class_="job-search-card__listdate") if metadata_card else None
date_posted = None
if datetime_tag and "datetime" in datetime_tag.attrs:
datetime_str = datetime_tag["datetime"] datetime_str = datetime_tag["datetime"]
try: try:
date_posted = datetime.strptime(datetime_str, "%Y-%m-%d") date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
except Exception as e: except Exception as e:
date_posted = None date_posted = None
else: benefits_tag = job_card.find("span", class_="result-benefits__text")
date_posted = None benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None
job_post = JobPost( description, job_type = self.get_job_description(job_url)
return JobPost(
title=title, title=title,
description=description, description=description,
company_name=company, company_name=company,
@ -145,26 +172,12 @@ class LinkedInScraper(Scraper):
date_posted=date_posted, date_posted=date_posted,
job_url=job_url, job_url=job_url,
job_type=job_type, job_type=job_type,
compensation=Compensation( benefits=benefits,
interval=CompensationInterval.YEARLY, currency=None emails=extract_emails_from_text(description)
),
) )
job_list.append(job_post)
if processed_jobs >= job_count:
break
if len(job_list) >= scraper_input.results_wanted:
break
if processed_jobs >= job_count:
break
if len(job_list) >= scraper_input.results_wanted:
break
page += 1 def get_job_description(self, job_page_url: str) -> tuple[None, None] | tuple[
str | None, tuple[str | None, JobType | None]]:
job_list = job_list[: scraper_input.results_wanted]
return JobResponse(jobs=job_list)
def get_description(self, job_page_url: str) -> Optional[str]:
""" """
Retrieves job description by going to the job page url Retrieves job description by going to the job page url
:param job_page_url: :param job_page_url:
@ -181,19 +194,19 @@ class LinkedInScraper(Scraper):
"div", class_=lambda x: x and "show-more-less-html__markup" in x "div", class_=lambda x: x and "show-more-less-html__markup" in x
) )
text_content = None description = None
if div_content: if div_content:
text_content = " ".join(div_content.get_text().split()).strip() description = " ".join(div_content.get_text().split()).strip()
def get_job_type( def get_job_type(
soup: BeautifulSoup, soup_job_type: BeautifulSoup,
) -> Tuple[Optional[str], Optional[JobType]]: ) -> JobType | None:
""" """
Gets the job type from job page Gets the job type from job page
:param soup: :param soup_job_type:
:return: JobType :return: JobType
""" """
h3_tag = soup.find( h3_tag = soup_job_type.find(
"h3", "h3",
class_="description__job-criteria-subheader", class_="description__job-criteria-subheader",
string=lambda text: "Employment type" in text, string=lambda text: "Employment type" in text,
@ -212,7 +225,7 @@ class LinkedInScraper(Scraper):
return LinkedInScraper.get_enum_from_value(employment_type) return LinkedInScraper.get_enum_from_value(employment_type)
return text_content, get_job_type(soup) return description, get_job_type(soup)
@staticmethod @staticmethod
def get_enum_from_value(value_str): def get_enum_from_value(value_str):

View File

@ -7,10 +7,9 @@ This module contains routines to scrape ZipRecruiter.
import math import math
import json import json
import re import re
import traceback from datetime import datetime, date
from datetime import datetime from typing import Optional, Tuple, Any
from typing import Optional, Tuple from urllib.parse import urlparse, parse_qs, urlunparse
from urllib.parse import urlparse, parse_qs
import tls_client import tls_client
import requests import requests
@ -29,6 +28,7 @@ from ...jobs import (
JobType, JobType,
Country, Country,
) )
from ...utils import extract_emails_from_text
class ZipRecruiterScraper(Scraper): class ZipRecruiterScraper(Scraper):
@ -48,18 +48,17 @@ class ZipRecruiterScraper(Scraper):
def find_jobs_in_page( def find_jobs_in_page(
self, scraper_input: ScraperInput, page: int self, scraper_input: ScraperInput, page: int
) -> tuple[list[JobPost], int | None]: ) -> list[JobPost]:
""" """
Scrapes a page of ZipRecruiter for jobs with scraper_input criteria Scrapes a page of ZipRecruiter for jobs with scraper_input criteria
:param scraper_input: :param scraper_input:
:param page: :param page:
:param session: :return: jobs found on page
:return: jobs found on page, total number of jobs found for search
""" """
job_list: list[JobPost] = [] job_list: list[JobPost] = []
try: try:
response = self.session.get( response = self.session.get(
self.url + "/jobs-search", f"{self.url}/jobs-search",
headers=ZipRecruiterScraper.headers(), headers=ZipRecruiterScraper.headers(),
params=ZipRecruiterScraper.add_params(scraper_input, page), params=ZipRecruiterScraper.add_params(scraper_input, page),
allow_redirects=True, allow_redirects=True,
@ -92,8 +91,6 @@ class ZipRecruiterScraper(Scraper):
page_variant = "html_1" page_variant = "html_1"
jobs_list = soup.find_all("li", {"class": "job-listing"}) jobs_list = soup.find_all("li", {"class": "job-listing"})
# print('type 1 html', len(jobs_list)) # print('type 1 html', len(jobs_list))
# with open("zip_method_8.html", "w") as f:
# f.write(soup.prettify())
with ThreadPoolExecutor(max_workers=10) as executor: with ThreadPoolExecutor(max_workers=10) as executor:
if page_variant == "javascript": if page_variant == "javascript":
@ -119,8 +116,9 @@ class ZipRecruiterScraper(Scraper):
:param scraper_input: :param scraper_input:
:return: job_response :return: job_response
""" """
start_page = (scraper_input.offset // self.jobs_per_page) + 1 if scraper_input.offset else 1
#: get first page to initialize session #: get first page to initialize session
job_list: list[JobPost] = self.find_jobs_in_page(scraper_input, 1) job_list: list[JobPost] = self.find_jobs_in_page(scraper_input, start_page)
pages_to_process = max( pages_to_process = max(
3, math.ceil(scraper_input.results_wanted / self.jobs_per_page) 3, math.ceil(scraper_input.results_wanted / self.jobs_per_page)
) )
@ -128,7 +126,7 @@ class ZipRecruiterScraper(Scraper):
with ThreadPoolExecutor(max_workers=10) as executor: with ThreadPoolExecutor(max_workers=10) as executor:
futures: list[Future] = [ futures: list[Future] = [
executor.submit(self.find_jobs_in_page, scraper_input, page) executor.submit(self.find_jobs_in_page, scraper_input, page)
for page in range(2, pages_to_process + 1) for page in range(start_page + 1, start_page + pages_to_process + 2)
] ]
for future in futures: for future in futures:
@ -144,8 +142,9 @@ class ZipRecruiterScraper(Scraper):
Parses a job from the job content tag Parses a job from the job content tag
:param job: BeautifulSoup Tag for one job post :param job: BeautifulSoup Tag for one job post
:return JobPost :return JobPost
TODO this method isn't finished due to not encountering this type of html often
""" """
job_url = job.find("a", {"class": "job_link"})["href"] job_url = self.cleanurl(job.find("a", {"class": "job_link"})["href"])
if job_url in self.seen_urls: if job_url in self.seen_urls:
return None return None
@ -153,7 +152,7 @@ class ZipRecruiterScraper(Scraper):
company = job.find("a", {"class": "company_name"}).text.strip() company = job.find("a", {"class": "company_name"}).text.strip()
description, updated_job_url = self.get_description(job_url) description, updated_job_url = self.get_description(job_url)
job_url = updated_job_url if updated_job_url else job_url # job_url = updated_job_url if updated_job_url else job_url
if description is None: if description is None:
description = job.find("p", {"class": "job_snippet"}).text.strip() description = job.find("p", {"class": "job_snippet"}).text.strip()
@ -176,6 +175,7 @@ class ZipRecruiterScraper(Scraper):
compensation=ZipRecruiterScraper.get_compensation(job), compensation=ZipRecruiterScraper.get_compensation(job),
date_posted=date_posted, date_posted=date_posted,
job_url=job_url, job_url=job_url,
emails=extract_emails_from_text(description),
) )
return job_post return job_post
@ -185,12 +185,12 @@ class ZipRecruiterScraper(Scraper):
:param job: BeautifulSoup Tag for one job post :param job: BeautifulSoup Tag for one job post
:return JobPost :return JobPost
""" """
job_url = job.find("a", class_="job_link")["href"] job_url = self.cleanurl(job.find("a", class_="job_link")["href"])
title = job.find("h2", class_="title").text title = job.find("h2", class_="title").text
company = job.find("a", class_="company_name").text.strip() company = job.find("a", class_="company_name").text.strip()
description, updated_job_url = self.get_description(job_url) description, updated_job_url = self.get_description(job_url)
job_url = updated_job_url if updated_job_url else job_url # job_url = updated_job_url if updated_job_url else job_url
if description is None: if description is None:
description = job.find("p", class_="job_snippet").get_text().strip() description = job.find("p", class_="job_snippet").get_text().strip()
@ -221,10 +221,10 @@ class ZipRecruiterScraper(Scraper):
def process_job_javascript(self, job: dict) -> JobPost: def process_job_javascript(self, job: dict) -> JobPost:
title = job.get("Title") title = job.get("Title")
job_url = job.get("JobURL") job_url = self.cleanurl(job.get("JobURL"))
description, updated_job_url = self.get_description(job_url) description, updated_job_url = self.get_description(job_url)
job_url = updated_job_url if updated_job_url else job_url # job_url = updated_job_url if updated_job_url else job_url
if description is None: if description is None:
description = BeautifulSoup( description = BeautifulSoup(
job.get("Snippet", "").strip(), "html.parser" job.get("Snippet", "").strip(), "html.parser"
@ -272,7 +272,6 @@ class ZipRecruiterScraper(Scraper):
date_posted = date_posted_obj.date() date_posted = date_posted_obj.date()
else: else:
date_posted = date.today() date_posted = date.today()
job_url = job.get("JobURL")
return JobPost( return JobPost(
title=title, title=title,
@ -323,7 +322,7 @@ class ZipRecruiterScraper(Scraper):
return None, response.url return None, response.url
@staticmethod @staticmethod
def add_params(scraper_input, page) -> Optional[str]: def add_params(scraper_input, page) -> dict[str, str | Any]:
params = { params = {
"search": scraper_input.search_term, "search": scraper_input.search_term,
"location": scraper_input.location, "location": scraper_input.location,
@ -368,7 +367,7 @@ class ZipRecruiterScraper(Scraper):
return CompensationInterval(interval_str) return CompensationInterval(interval_str)
@staticmethod @staticmethod
def get_date_posted(job: BeautifulSoup) -> Optional[datetime.date]: def get_date_posted(job: Tag) -> Optional[datetime.date]:
""" """
Extracts the date a job was posted Extracts the date a job was posted
:param job :param job
@ -394,7 +393,7 @@ class ZipRecruiterScraper(Scraper):
return None return None
@staticmethod @staticmethod
def get_compensation(job: BeautifulSoup) -> Optional[Compensation]: def get_compensation(job: Tag) -> Optional[Compensation]:
""" """
Parses the compensation tag from the job BeautifulSoup object Parses the compensation tag from the job BeautifulSoup object
:param job :param job
@ -435,7 +434,7 @@ class ZipRecruiterScraper(Scraper):
return create_compensation_object(pay) return create_compensation_object(pay)
@staticmethod @staticmethod
def get_location(job: BeautifulSoup) -> Location: def get_location(job: Tag) -> Location:
""" """
Extracts the job location from BeautifulSoup object Extracts the job location from BeautifulSoup object
:param job: :param job:
@ -462,3 +461,9 @@ class ZipRecruiterScraper(Scraper):
return { return {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36" "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
} }
@staticmethod
def cleanurl(url: Optional[str]) -> str:
    """
    Strips the query string and fragment from a URL
    :param url: URL to clean; may be None or empty when a listing carries no link
    :return: URL rebuilt from its scheme, netloc, path and params only,
        or "" when url is None or empty
    """
    if not url:
        # urlparse(None) silently switches to bytes mode and would hand back b"",
        # so short-circuit to keep the return type a str.
        return ""
    scheme, netloc, path, params, _query, _fragment = urlparse(url)
    # Blank out the last two components (query, fragment); keep everything else.
    return urlunparse((scheme, netloc, path, params, "", ""))

View File

@ -1,4 +1,5 @@
from ..jobspy import scrape_jobs from ..jobspy import scrape_jobs
import pandas as pd
def test_all(): def test_all():
@ -7,4 +8,5 @@ def test_all():
search_term="software engineer", search_term="software engineer",
results_wanted=5, results_wanted=5,
) )
assert result is not None and result.errors.empty is True
assert isinstance(result, pd.DataFrame) and not result.empty, "Result should be a non-empty DataFrame"

View File

@ -1,4 +1,5 @@
from ..jobspy import scrape_jobs from ..jobspy import scrape_jobs
import pandas as pd
def test_indeed(): def test_indeed():
@ -6,4 +7,4 @@ def test_indeed():
site_name="indeed", site_name="indeed",
search_term="software engineer", search_term="software engineer",
) )
assert result is not None and result.errors.empty is True assert isinstance(result, pd.DataFrame) and not result.empty, "Result should be a non-empty DataFrame"

View File

@ -1,4 +1,5 @@
from ..jobspy import scrape_jobs from ..jobspy import scrape_jobs
import pandas as pd
def test_linkedin(): def test_linkedin():
@ -6,4 +7,4 @@ def test_linkedin():
site_name="linkedin", site_name="linkedin",
search_term="software engineer", search_term="software engineer",
) )
assert result is not None and result.errors.empty is True assert isinstance(result, pd.DataFrame) and not result.empty, "Result should be a non-empty DataFrame"

View File

@ -1,4 +1,5 @@
from ..jobspy import scrape_jobs from ..jobspy import scrape_jobs
import pandas as pd
def test_ziprecruiter(): def test_ziprecruiter():
@ -7,4 +8,4 @@ def test_ziprecruiter():
search_term="software engineer", search_term="software engineer",
) )
assert result is not None and result.errors.empty is True assert isinstance(result, pd.DataFrame) and not result.empty, "Result should be a non-empty DataFrame"