Merge pull request #1 from Bunsly/main

recent changes from main
pull/91/head
WillBlears 2023-11-16 17:26:48 -05:00 committed by GitHub
commit 0bcfd6a6f6
6 changed files with 90 additions and 37 deletions

View File

@@ -4,7 +4,7 @@
**Not technical?** Try out the web scraping tool on our site at [usejobspy.com](https://usejobspy.com).
-*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/bunsly/15min)** *to
+*Looking to build a data-focused software product?* **[Book a call](https://bunsly.com/)** *to
work with us.*
Check out another project we wrote: ***[HomeHarvest](https://github.com/Bunsly/HomeHarvest)** a Python package
@@ -62,7 +62,7 @@ zip_recruiter Software Developer TEKsystems Phoenix
```plaintext
Required
-├── site_type (List[enum]): linkedin, zip_recruiter, indeed
+├── site_type (List[enum]): linkedin, zip_recruiter, indeed, glassdoor
└── search_term (str)
Optional
├── location (str)
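With Glassdoor joining the site list, a basic call could look like the sketch below. This is a minimal usage sketch; the keyword names (`site_name`, `search_term`, `country_indeed`, etc.) follow this era's README examples and are assumptions, not guaranteed signatures.

```python
# Hedged usage sketch: glassdoor now joins the supported sites.
from jobspy import scrape_jobs

jobs = scrape_jobs(
    site_name=["linkedin", "zip_recruiter", "indeed", "glassdoor"],  # glassdoor is new
    search_term="software engineer",
    location="Austin, TX",
    results_wanted=20,
    country_indeed="USA",  # required when indeed/glassdoor are included
)
print(jobs.head())  # scrape_jobs returns a pandas DataFrame
```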
@@ -107,18 +107,19 @@ The following exceptions may be raised when using JobSpy:
* `LinkedInException`
* `IndeedException`
* `ZipRecruiterException`
+* `GlassdoorException`
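Each site raises its own exception type, so a caller can skip a failing site without abandoning the whole run. A hedged sketch; the import path `jobspy.scrapers.exceptions` is inferred from the relative imports visible in this diff and may differ:

```python
# Hedged sketch: per-site scraper errors can be caught individually.
from jobspy import scrape_jobs
from jobspy.scrapers.exceptions import GlassdoorException  # path is an assumption

try:
    jobs = scrape_jobs(site_name=["glassdoor"], search_term="data engineer",
                       country_indeed="USA")
except GlassdoorException as e:
    print(f"Glassdoor scrape failed, skipping: {e}")
```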
## Supported Countries for Job Searching
### **LinkedIn**
-LinkedIn searches globally & uses only the `location` parameter.
+LinkedIn searches globally & uses only the `location` parameter. You can fetch at most 1,000 jobs from the LinkedIn endpoint we're using.
### **ZipRecruiter**
ZipRecruiter searches for jobs in **US/Canada** & uses only the `location` parameter.
-### **Indeed**
+### **Indeed / Glassdoor**
+Indeed & Glassdoor support most countries, but the `country_indeed` parameter is required. Additionally, use the `location`
+parameter to narrow down the location, e.g. city & state if necessary.
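For Indeed and Glassdoor, `country_indeed` selects which country endpoint is queried, while `location` only narrows results within it. A hedged sketch of the distinction (parameter names as above):

```python
# Hedged sketch: country_indeed picks the country site; location narrows within it.
from jobspy import scrape_jobs

jobs = scrape_jobs(
    site_name=["indeed", "glassdoor"],
    search_term="backend developer",
    country_indeed="Canada",  # required: selects the country endpoint
    location="Toronto, ON",   # optional: narrows within that country
)
```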
@@ -145,6 +146,7 @@ You can specify the following countries when searching on Indeed (use the exact
| Venezuela | Vietnam | | |
+Glassdoor can only fetch 900 jobs per search from the endpoint we're using.
## Frequently Asked Questions
---

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.25"
version = "1.1.28"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy"

View File

@@ -163,6 +163,7 @@ def scrape_jobs(
"site",
"title",
"company",
"company_url",
"location",
"job_type",
"date_posted",

View File

@@ -196,6 +196,8 @@ class JobPost(BaseModel):
location: Optional[Location]
description: str | None = None
+company_url: str | None = None
job_type: list[JobType] | None = None
compensation: Compensation | None = None
date_posted: date | None = None
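Because `company_url` defaults to `None` on the Pydantic model, existing construction sites keep working unchanged. A minimal sketch; the import path and the exact set of required fields are assumptions:

```python
# Hedged sketch: the new field is optional, so both constructions are valid.
from jobspy.jobs import JobPost  # import path is an assumption

with_url = JobPost(title="Data Engineer", company_name="Acme",
                   job_url="https://example.com/job/1",
                   company_url="https://example.com/company/acme")
without_url = JobPost(title="Data Engineer", company_name="Acme",
                      job_url="https://example.com/job/2")  # company_url stays None
```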

View File

@@ -10,15 +10,15 @@ from datetime import datetime
import requests
import time
from requests.exceptions import ProxyError
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
from bs4.element import Tag
from threading import Lock
from urllib.parse import urlparse, urlunparse
from .. import Scraper, ScraperInput, Site
-from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type
+from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type, currency_parser
from ..exceptions import LinkedInException
-from ...jobs import JobPost, Location, JobResponse, JobType, Country
+from ...jobs import JobPost, Location, JobResponse, JobType, Country, Compensation
class LinkedInScraper(Scraper):
@@ -66,12 +66,10 @@ class LinkedInScraper(Scraper):
if scraper_input.job_type
else None,
"pageNum": 0,
-page: page + scraper_input.offset,
+"start": page + scraper_input.offset,
"f_AL": "true" if scraper_input.easy_apply else None,
}
params = {k: v for k, v in params.items() if v is not None}
retries = 0
while retries < self.MAX_RETRIES:
@@ -88,7 +86,7 @@
break
except requests.HTTPError as e:
if hasattr(e, "response") and e.response is not None:
-if e.response.status_code == 429:
+if e.response.status_code in (429, 502):
time.sleep(self.DELAY)
retries += 1
continue
@@ -110,8 +108,6 @@
soup = BeautifulSoup(response.text, "html.parser")
-with ThreadPoolExecutor(max_workers=5) as executor:
-futures = []
for job_card in soup.find_all("div", class_="base-search-card"):
job_url = None
href_tag = job_card.find("a", class_="base-card__full-link")
@@ -125,28 +121,46 @@
continue
seen_urls.add(job_url)
-futures.append(executor.submit(self.process_job, job_card, job_url))
-for future in as_completed(futures):
+# Call process_job directly without threading
try:
-job_post = future.result()
+job_post = self.process_job(job_card, job_url)
if job_post:
job_list.append(job_post)
except Exception as e:
-raise LinkedInException(
-"Exception occurred while processing jobs"
-)
+raise LinkedInException("Exception occurred while processing jobs")
page += 25
job_list = job_list[: scraper_input.results_wanted]
return JobResponse(jobs=job_list)
def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]:
+salary_tag = job_card.find('span', class_='job-search-card__salary-info')
+compensation = None
+if salary_tag:
+salary_text = salary_tag.get_text(separator=' ').strip()
+salary_values = [currency_parser(value) for value in salary_text.split('-')]
+salary_min = salary_values[0]
+salary_max = salary_values[1]
+currency = salary_text[0] if salary_text[0] != '$' else 'USD'
+compensation = Compensation(
+min_amount=int(salary_min),
+max_amount=int(salary_max),
+currency=currency,
+)
title_tag = job_card.find("span", class_="sr-only")
title = title_tag.get_text(strip=True) if title_tag else "N/A"
company_tag = job_card.find("h4", class_="base-search-card__subtitle")
company_a_tag = company_tag.find("a") if company_tag else None
+company_url = (
+urlunparse(urlparse(company_a_tag.get("href"))._replace(query=""))
+if company_a_tag and company_a_tag.has_attr("href")
+else ""
+)
company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A"
metadata_card = job_card.find("div", class_="base-search-card__metadata")
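To see what the new salary branch produces, it helps to trace a typical LinkedIn salary string through the same steps. The sketch below inlines the `currency_parser` logic added to `utils` in this same commit; the sample string is illustrative:

```python
import re
import numpy as np

def currency_parser(cur_str):
    # Same logic as the currency_parser added to utils in this commit.
    cur_str = re.sub("[^-0-9.,]", "", cur_str)
    cur_str = re.sub("[.,]", "", cur_str[:-3]) + cur_str[-3:]
    if "." in cur_str[-3:]:
        num = float(cur_str)
    elif "," in cur_str[-3:]:
        num = float(cur_str.replace(",", "."))
    else:
        num = float(cur_str)
    return np.round(num, 2)

# Tracing the process_job salary branch with a sample span text:
salary_text = "$85,000.00 - $110,000.00"
salary_values = [currency_parser(value) for value in salary_text.split("-")]
print(salary_values)  # [85000.0, 110000.0]
currency = salary_text[0] if salary_text[0] != "$" else "USD"
print(currency)       # USD
```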
@@ -168,15 +182,18 @@ class LinkedInScraper(Scraper):
benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None
description, job_type = self.get_job_description(job_url)
+# description, job_type = None, []
return JobPost(
title=title,
description=description,
company_name=company,
+company_url=company_url,
location=location,
date_posted=date_posted,
job_url=job_url,
job_type=job_type,
+compensation=compensation,
benefits=benefits,
emails=extract_emails_from_text(description) if description else None,
num_urgent_words=count_urgent_words(description) if description else None,
@@ -193,8 +210,15 @@
try:
response = requests.get(job_page_url, timeout=5, proxies=self.proxy)
response.raise_for_status()
+except requests.HTTPError as e:
+if hasattr(e, "response") and e.response is not None:
+if e.response.status_code in (429, 502):
+time.sleep(self.DELAY)
+return None, None
except Exception as e:
return None, None
if response.url == "https://www.linkedin.com/signup":
return None, None
soup = BeautifulSoup(response.text, "html.parser")
div_content = soup.find(
@@ -230,7 +254,7 @@
employment_type = employment_type.lower()
employment_type = employment_type.replace("-", "")
-return [get_enum_from_job_type(employment_type)]
+return [get_enum_from_job_type(employment_type)] if employment_type else []
return description, get_job_type(soup)
@@ -254,5 +278,12 @@ class LinkedInScraper(Scraper):
state=state,
country=Country.from_string(self.country),
)
+elif len(parts) == 3:
+city, state, country = parts
+location = Location(
+city=city,
+state=state,
+country=Country.from_string(country),
+)
return location
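The new `elif` covers location strings that already carry an explicit country, e.g. "Austin, Texas, United States". An illustrative sketch of how the two branches split such strings (the dict stands in for the `Location` model):

```python
# Illustrative only: how the two-part and three-part branches split a string.
def split_location(location_string: str) -> dict:
    parts = [part.strip() for part in location_string.split(",")]
    if len(parts) == 2:
        city, state = parts
        return {"city": city, "state": state, "country": "<scraper's country>"}
    elif len(parts) == 3:
        city, state, country = parts  # country comes from the string itself
        return {"city": city, "state": state, "country": country}
    return {}

print(split_location("Austin, Texas"))                 # falls back to the scraper's country
print(split_location("Austin, Texas, United States"))  # new: explicit country parsed
```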

View File

@@ -1,4 +1,5 @@
import re
+import numpy as np
import requests
import tls_client
@@ -62,3 +63,19 @@ def get_enum_from_job_type(job_type_str: str) -> JobType | None:
if job_type_str in job_type.value:
res = job_type
return res
+def currency_parser(cur_str):
+# Remove any non-numerical characters
+# except for ',' '.' or '-' (e.g. EUR)
+cur_str = re.sub("[^-0-9.,]", '', cur_str)
+# Remove any 000s separators (either , or .)
+cur_str = re.sub("[.,]", '', cur_str[:-3]) + cur_str[-3:]
+if '.' in list(cur_str[-3:]):
+num = float(cur_str)
+elif ',' in list(cur_str[-3:]):
+num = float(cur_str.replace(',', '.'))
+else:
+num = float(cur_str)
+return np.round(num, 2)
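A few worked examples of what `currency_parser` returns for common salary formats (values traced through the logic above):

```python
currency_parser("$1,200.50")  # -> 1200.5   (US format: ',' thousands, '.' decimal)
currency_parser("1.234,56")   # -> 1234.56  (European format: '.' thousands, ',' decimal)
currency_parser("120,000")    # -> 120000.0 (separator stripped, no decimal part)
currency_parser("$45")        # -> 45.0     (short strings pass straight through)
```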