Compare commits

..

9 Commits

Author SHA1 Message Date
Zachary Hampton
a37e7f235e Merge pull request #42 from cullenwatson/fix/class-type-error
- refactor & #41 bug fix
2023-09-06 16:33:59 -07:00
Zachary Hampton
690739e858 - refactor & #41 bug fix 2023-09-06 16:32:51 -07:00
Cullen Watson
43eb2fe0e8 remove gitattr 2023-09-06 11:34:51 -05:00
Cullen Watson
e50227bba6 clear output jupyter 2023-09-06 11:32:32 -05:00
Cullen Watson
45c2d76e15 add yt guide 2023-09-06 11:26:55 -05:00
Cullen Watson
fd883178be Thread sites (#40) 2023-09-06 09:47:11 -05:00
Cullen Watson
70e2218c67 reduce size of jupyter notebook 2023-09-05 13:09:18 -05:00
Cullen Watson
d6947ecdd7 Update README.md 2023-09-05 13:03:32 -05:00
Cullen Watson
5191658562 Update README.md 2023-09-05 12:27:00 -05:00
12 changed files with 261 additions and 1391 deletions

File diff suppressed because it is too large Load Diff

140
README.md
View File

@@ -7,10 +7,16 @@
- Scrapes job postings from **LinkedIn**, **Indeed** & **ZipRecruiter** simultaneously - Scrapes job postings from **LinkedIn**, **Indeed** & **ZipRecruiter** simultaneously
- Aggregates the job postings in a Pandas DataFrame - Aggregates the job postings in a Pandas DataFrame
[Video Guide for JobSpy](https://www.youtube.com/watch?v=-yS3mgI5H-4)
![jobspy](https://github.com/cullenwatson/JobSpy/assets/78247585/ec7ef355-05f6-4fd3-8161-a817e31c5c57) ![jobspy](https://github.com/cullenwatson/JobSpy/assets/78247585/ec7ef355-05f6-4fd3-8161-a817e31c5c57)
### Installation ### Installation
`pip install python-jobspy` ```
pip install python-jobspy
```
_Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_ _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) required_
@@ -20,31 +26,30 @@
from jobspy import scrape_jobs from jobspy import scrape_jobs
import pandas as pd import pandas as pd
jobs: pd.DataFrame = scrape_jobs( result: pd.DataFrame = scrape_jobs(
site_name=["indeed", "linkedin", "zip_recruiter"], site_name=["indeed", "linkedin", "zip_recruiter"],
search_term="software engineer", search_term="software engineer",
location="Dallas, TX", location="Dallas, TX",
results_wanted=10, results_wanted=10,
# country: only needed for indeed country_indeed='USA' # only needed for indeed
country='USA'
) )
if jobs.empty: pd.set_option('display.max_columns', None)
print("No jobs found.") pd.set_option('display.max_rows', None)
else: pd.set_option('display.width', None)
#1 print pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc
print(jobs)
#2 display in Jupyter Notebook #1 output
#display(jobs) print(result.jobs)
print(result.errors)
#3 output to .csv #2 display in Jupyter Notebook
#jobs.to_csv('jobs.csv', index=False) #display(result.jobs)
#display(result.errors)
#3 output to .csv
#result.jobs.to_csv('result.jobs.csv', index=False)
``` ```
### Output ### Output
@@ -68,8 +73,8 @@ Optional
├── job_type (enum): fulltime, parttime, internship, contract ├── job_type (enum): fulltime, parttime, internship, contract
├── is_remote (bool) ├── is_remote (bool)
├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type' ├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type'
├── easy_apply (bool): filters for jobs on LinkedIn that have the 'Easy Apply' option ├── easy_apply (bool): filters for jobs that are hosted on LinkedIn
├── country (enum): uses the corresponding subdomain on Indeed (e.g. Canada on Indeed is ca.indeed.com) ├── country_indeed (enum): filters the country on Indeed
``` ```
@@ -77,20 +82,20 @@ Optional
```plaintext ```plaintext
JobPost JobPost
├── title (str) ├── title (str)
├── company_name (str) ├── company (str)
├── job_url (str) ├── job_url (str)
├── location (object) ├── location (object)
│ ├── country (str) │ ├── country (str)
│ ├── city (str) │ ├── city (str)
│ ├── state (str) │ ├── state (str)
├── description (str) ├── description (str)
├── job_type (enum) ├── job_type (enum): fulltime, parttime, internship, contract
├── compensation (object) ├── compensation (object)
│ ├── interval (CompensationInterval): yearly, monthly, weekly, daily, hourly │ ├── interval (enum): yearly, monthly, weekly, daily, hourly
│ ├── min_amount (int) │ ├── min_amount (int)
│ ├── max_amount (int) │ ├── max_amount (int)
│ └── currency (str) │ └── currency (enum)
└── date_posted (datetime) └── date_posted (date)
``` ```
## Supported Countries for Job Searching ## Supported Countries for Job Searching
@@ -98,80 +103,37 @@ JobPost
### **LinkedIn** ### **LinkedIn**
LinkedIn searches globally. Use the `location` parameter LinkedIn searches globally & uses only the `location` parameter
### **ZipRecruiter** ### **ZipRecruiter**
ZipRecruiter searches for jobs in US/Canada. Use the `location` parameter ZipRecruiter searches for jobs in US/Canada & uses only the `location` parameter
### **Indeed** ### **Indeed**
For Indeed, use `location` along with the `country` param For Indeed, the `country_indeed` parameter is required. Additionally, use the `location` parameter and include the city or state if necessary.
You can specify the following countries when searching on Indeed (use the exact name): You can specify the following countries when searching on Indeed (use the exact name):
- Argentina
- Australia | | | | |
- Austria |------|------|------|------|
- Bahrain | Argentina | Australia | Austria | Bahrain |
- Belgium | Belgium | Brazil | Canada | Chile |
- Brazil | China | Colombia | Costa Rica | Czech Republic |
- Canada | Denmark | Ecuador | Egypt | Finland |
- Chile | France | Germany | Greece | Hong Kong |
- China | Hungary | India | Indonesia | Ireland |
- Colombia | Israel | Italy | Japan | Kuwait |
- Costa Rica | Luxembourg | Malaysia | Mexico | Morocco |
- Czech Republic | Netherlands | New Zealand | Nigeria | Norway |
- Denmark | Oman | Pakistan | Panama | Peru |
- Ecuador | Philippines | Poland | Portugal | Qatar |
- Egypt | Romania | Saudi Arabia | Singapore | South Africa |
- Finland | South Korea | Spain | Sweden | Switzerland |
- France | Taiwan | Thailand | Turkey | Ukraine |
- Germany | United Arab Emirates | UK | USA | Uruguay |
- Greece | Venezuela | Vietnam | | |
- Hong Kong
- Hungary
- India
- Indonesia
- Ireland
- Israel
- Italy
- Japan
- Kuwait
- Luxembourg
- Malaysia
- Mexico
- Morocco
- Netherlands
- New Zealand
- Nigeria
- Norway
- Oman
- Pakistan
- Panama
- Peru
- Philippines
- Poland
- Portugal
- Qatar
- Romania
- Saudi Arabia
- Singapore
- South Africa
- South Korea
- Spain
- Sweden
- Switzerland
- Taiwan
- Thailand
- Turkey
- Ukraine
- United Arab Emirates
- UK
- USA
- Uruguay
- Venezuela
- Vietnam
## Frequently Asked Questions ## Frequently Asked Questions

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "python-jobspy" name = "python-jobspy"
version = "1.1.0" version = "1.1.2"
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter" description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"] authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
readme = "README.md" readme = "README.md"

View File

@@ -1,5 +1,7 @@
import pandas as pd import pandas as pd
from typing import List, Tuple import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple, NamedTuple, Dict
from .jobs import JobType, Location from .jobs import JobType, Location
from .scrapers.indeed import IndeedScraper from .scrapers.indeed import IndeedScraper
@@ -7,7 +9,6 @@ from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.linkedin import LinkedInScraper from .scrapers.linkedin import LinkedInScraper
from .scrapers import ScraperInput, Site, JobResponse, Country from .scrapers import ScraperInput, Site, JobResponse, Country
SCRAPER_MAPPING = { SCRAPER_MAPPING = {
Site.LINKEDIN: LinkedInScraper, Site.LINKEDIN: LinkedInScraper,
Site.INDEED: IndeedScraper, Site.INDEED: IndeedScraper,
@@ -15,12 +16,17 @@ SCRAPER_MAPPING = {
} }
class ScrapeResults(NamedTuple):
jobs: pd.DataFrame
errors: pd.DataFrame
def _map_str_to_site(site_name: str) -> Site: def _map_str_to_site(site_name: str) -> Site:
return Site[site_name.upper()] return Site[site_name.upper()]
def scrape_jobs( def scrape_jobs(
site_name: str | Site | List[Site], site_name: str | List[str] | Site | List[Site],
search_term: str, search_term: str,
location: str = "", location: str = "",
distance: int = None, distance: int = None,
@@ -28,19 +34,21 @@ def scrape_jobs(
job_type: JobType = None, job_type: JobType = None,
easy_apply: bool = False, # linkedin easy_apply: bool = False, # linkedin
results_wanted: int = 15, results_wanted: int = 15,
country: str = "usa", country_indeed: str = "usa",
) -> pd.DataFrame: hyperlinks: bool = False
) -> ScrapeResults:
""" """
Asynchronously scrapes job data from multiple job sites. Asynchronously scrapes job data from multiple job sites.
:return: results_wanted: pandas dataframe containing job data :return: results_wanted: pandas dataframe containing job data
""" """
if type(site_name) == str: if type(site_name) == str:
site_name = _map_str_to_site(site_name) site_type = [_map_str_to_site(site_name)]
else: #: if type(site_name) == list
site_type = [_map_str_to_site(site) if type(site) == str else site_name for site in site_name]
country_enum = Country.from_string(country) country_enum = Country.from_string(country_indeed)
site_type = [site_name] if type(site_name) == Site else site_name
scraper_input = ScraperInput( scraper_input = ScraperInput(
site_type=site_type, site_type=site_type,
country=country_enum, country=country_enum,
@@ -54,22 +62,35 @@ def scrape_jobs(
) )
def scrape_site(site: Site) -> Tuple[str, JobResponse]: def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site] try:
scraper = scraper_class() scraper_class = SCRAPER_MAPPING[site]
scraped_data: JobResponse = scraper.scrape(scraper_input) scraper = scraper_class()
scraped_data: JobResponse = scraper.scrape(scraper_input)
except Exception as e:
scraped_data = JobResponse(jobs=[], error=str(e), success=False)
return site.value, scraped_data return site.value, scraped_data
results = {} results, errors = {}, {}
for site in scraper_input.site_type:
def worker(site):
site_value, scraped_data = scrape_site(site) site_value, scraped_data = scrape_site(site)
results[site_value] = scraped_data return site_value, scraped_data
with ThreadPoolExecutor() as executor:
future_to_site = {executor.submit(worker, site): site for site in scraper_input.site_type}
for future in concurrent.futures.as_completed(future_to_site):
site_value, scraped_data = future.result()
results[site_value] = scraped_data
if scraped_data.error:
errors[site_value] = scraped_data.error
dfs = [] dfs = []
for site, job_response in results.items(): for site, job_response in results.items():
for job in job_response.jobs: for job in job_response.jobs:
data = job.dict() data = job.dict()
data["job_url_hyper"] = f'<a href="{data["job_url"]}">{data["job_url"]}</a>'
data["site"] = site data["site"] = site
data["company"] = data["company_name"] data["company"] = data["company_name"]
if data["job_type"]: if data["job_type"]:
@@ -99,23 +120,41 @@ def scrape_jobs(
job_df = pd.DataFrame([data]) job_df = pd.DataFrame([data])
dfs.append(job_df) dfs.append(job_df)
errors_list = [(key, value) for key, value in errors.items()]
errors_df = pd.DataFrame(errors_list, columns=["Site", "Error"])
if dfs: if dfs:
df = pd.concat(dfs, ignore_index=True) df = pd.concat(dfs, ignore_index=True)
desired_order = [ if hyperlinks:
"site", desired_order = [
"title", "site",
"company", "title",
"location", "company",
"job_type", "location",
"interval", "job_type",
"min_amount", "interval",
"max_amount", "min_amount",
"currency", "max_amount",
"job_url", "currency",
"description", "job_url_hyper",
] "description",
]
else:
desired_order = [
"site",
"title",
"company",
"location",
"job_type",
"interval",
"min_amount",
"max_amount",
"currency",
"job_url",
"description",
]
df = df[desired_order] df = df[desired_order]
else: else:
df = pd.DataFrame() df = pd.DataFrame()
return df return ScrapeResults(jobs=df, errors=errors_df)

View File

@@ -27,14 +27,6 @@ class ScraperInput(BaseModel):
results_wanted: int = 15 results_wanted: int = 15
class CommonResponse(BaseModel):
status: Optional[str]
error: Optional[str]
linkedin: Optional[Any] = None
indeed: Optional[Any] = None
zip_recruiter: Optional[Any] = None
class Scraper: class Scraper:
def __init__(self, site: Site): def __init__(self, site: Site):
self.site = site self.site = site

View File

@@ -78,9 +78,7 @@ class IndeedScraper(Scraper):
raise StatusException(response.status_code) raise StatusException(response.status_code)
soup = BeautifulSoup(response.content, "html.parser") soup = BeautifulSoup(response.content, "html.parser")
with open("text2.html", "w", encoding="utf-8") as f: if "did not match any jobs" in response.text:
f.write(str(soup))
if "did not match any jobs" in str(soup):
raise ParsingException("Search did not match any jobs") raise ParsingException("Search did not match any jobs")
jobs = IndeedScraper.parse_jobs( jobs = IndeedScraper.parse_jobs(
@@ -197,7 +195,6 @@ class IndeedScraper(Scraper):
error=f"Indeed failed to parse response: {e}", error=f"Indeed failed to parse response: {e}",
) )
except Exception as e: except Exception as e:
print(f"LinkedIn failed to scrape: {e}\n{traceback.format_exc()}")
return JobResponse( return JobResponse(
success=False, success=False,
error=f"Indeed failed to scrape: {e}", error=f"Indeed failed to scrape: {e}",
@@ -230,11 +227,9 @@ class IndeedScraper(Scraper):
formatted_url, allow_redirects=True, timeout_seconds=5 formatted_url, allow_redirects=True, timeout_seconds=5
) )
except requests.exceptions.Timeout: except requests.exceptions.Timeout:
print("The request timed out.")
return None return None
if response.status_code not in range(200, 400): if response.status_code not in range(200, 400):
print("status code not in range")
return None return None
raw_description = response.json()["body"]["jobInfoWrapperModel"][ raw_description = response.json()["body"]["jobInfoWrapperModel"][

View File

@@ -1,7 +1,9 @@
from typing import Optional, Tuple from typing import Optional, Tuple
from datetime import datetime from datetime import datetime
import traceback
import requests import requests
from requests.exceptions import Timeout
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import Tag from bs4.element import Tag
@@ -67,9 +69,12 @@ class LinkedInScraper(Scraper):
) )
if response.status_code != 200: if response.status_code != 200:
reason = ' (too many requests)' if response.status_code == 429 else ''
return JobResponse( return JobResponse(
success=False, success=False,
error=f"Response returned {response.status_code}", error=f"LinkedIn returned {response.status_code} {reason}",
jobs=job_list,
total_results=job_count,
) )
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
@@ -113,7 +118,10 @@ class LinkedInScraper(Scraper):
description, job_type = LinkedInScraper.get_description(job_url) description, job_type = LinkedInScraper.get_description(job_url)
if datetime_tag: if datetime_tag:
datetime_str = datetime_tag["datetime"] datetime_str = datetime_tag["datetime"]
date_posted = datetime.strptime(datetime_str, "%Y-%m-%d") try:
date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
except Exception as e:
date_posted = None
else: else:
date_posted = None date_posted = None
@@ -130,15 +138,13 @@ class LinkedInScraper(Scraper):
), ),
) )
job_list.append(job_post) job_list.append(job_post)
if ( if processed_jobs >= job_count:
len(job_list) >= scraper_input.results_wanted
or processed_jobs >= job_count
):
break break
if ( if len(job_list) >= scraper_input.results_wanted:
len(job_list) >= scraper_input.results_wanted break
or processed_jobs >= job_count if processed_jobs >= job_count:
): break
if len(job_list) >= scraper_input.results_wanted:
break break
page += 1 page += 1
@@ -158,7 +164,11 @@ class LinkedInScraper(Scraper):
:param job_page_url: :param job_page_url:
:return: description or None :return: description or None
""" """
response = requests.get(job_page_url, allow_redirects=True) try:
response = requests.get(job_page_url, timeout=5)
except Timeout:
return None, None
if response.status_code not in range(200, 400): if response.status_code not in range(200, 400):
return None, None return None, None

View File

@@ -148,7 +148,6 @@ class ZipRecruiterScraper(Scraper):
error=f"ZipRecruiter returned status code {e.status_code}", error=f"ZipRecruiter returned status code {e.status_code}",
) )
except Exception as e: except Exception as e:
print(f"ZipRecruiter failed to scrape: {e}\n{traceback.format_exc()}")
return JobResponse( return JobResponse(
success=False, success=False,
error=f"ZipRecruiter failed to scrape: {e}", error=f"ZipRecruiter failed to scrape: {e}",
@@ -302,7 +301,6 @@ class ZipRecruiterScraper(Scraper):
timeout_seconds=5, timeout_seconds=5,
) )
except requests.exceptions.Timeout: except requests.exceptions.Timeout:
print("The request timed out.")
return None return None
html_string = response.content html_string = response.content

10
src/tests/test_all.py Normal file
View File

@@ -0,0 +1,10 @@
from ..jobspy import scrape_jobs
def test_all():
result = scrape_jobs(
site_name=["linkedin", "indeed", "zip_recruiter"],
search_term="software engineer",
results_wanted=5,
)
assert result is not None and result.errors.empty is True

View File

@@ -6,4 +6,4 @@ def test_indeed():
site_name="indeed", site_name="indeed",
search_term="software engineer", search_term="software engineer",
) )
assert result is not None assert result is not None and result.errors.empty is True

View File

@@ -1,4 +1,4 @@
from jobspy import scrape_jobs from ..jobspy import scrape_jobs
def test_linkedin(): def test_linkedin():
@@ -6,4 +6,4 @@ def test_linkedin():
site_name="linkedin", site_name="linkedin",
search_term="software engineer", search_term="software engineer",
) )
assert result is not None assert result is not None and result.errors.empty is True

View File

@@ -1,4 +1,4 @@
from jobspy import scrape_jobs from ..jobspy import scrape_jobs
def test_ziprecruiter(): def test_ziprecruiter():
@@ -7,4 +7,4 @@ def test_ziprecruiter():
search_term="software engineer", search_term="software engineer",
) )
assert result is not None assert result is not None and result.errors.empty is True