fix linkedin bug & add linkedin company url (#67)

pull/68/head v1.1.26
Cullen Watson 2023-11-08 15:51:07 -06:00 committed by GitHub
parent a2c8fe046e
commit cc9e7866b7
5 changed files with 46 additions and 34 deletions

View File

@@ -62,7 +62,7 @@ zip_recruiter Software Developer TEKsystems Phoenix
 ```plaintext
 Required
-├── site_type (List[enum]): linkedin, zip_recruiter, indeed
+├── site_type (List[enum]): linkedin, zip_recruiter, indeed, glassdoor
 └── search_term (str)
 Optional
 ├── location (int)
@@ -107,21 +107,22 @@ The following exceptions may be raised when using JobSpy:
 * `LinkedInException`
 * `IndeedException`
 * `ZipRecruiterException`
+* `GlassdoorException`
 
 ## Supported Countries for Job Searching
 
 ### **LinkedIn**
-LinkedIn searches globally & uses only the `location` parameter.
+LinkedIn searches globally & uses only the `location` parameter. You can fetch at most 1,000 jobs from the LinkedIn endpoint we're using.
 
 ### **ZipRecruiter**
 ZipRecruiter searches for jobs in **US/Canada** & uses only the `location` parameter.
 
-### **Indeed**
+### **Indeed / Glassdoor**
 Indeed & Glassdoor support most countries, but the `country_indeed` parameter is required. Additionally, use the `location`
 parameter to narrow down the location, e.g. city & state if necessary.
 
 You can specify the following countries when searching on Indeed (use the exact name, * indicates support for Glassdoor):
@@ -145,6 +146,7 @@ You can specify the following countries when searching on Indeed (use the exact
 | Venezuela | Vietnam | | |
 
+Glassdoor can only fetch 900 jobs per search from the endpoint we're using.
 
 ## Frequently Asked Questions
 ---
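
Given the new `GlassdoorException` above, callers can handle each site's failure separately. A minimal sketch — the `jobspy.scrapers.exceptions` import path is an assumption and may differ by version:

```python
from jobspy import scrape_jobs
from jobspy.scrapers.exceptions import LinkedInException, GlassdoorException  # assumed path

try:
    jobs = scrape_jobs(site_name=["linkedin", "glassdoor"], search_term="data analyst")
except LinkedInException:
    print("LinkedIn search failed (often rate limiting past ~1,000 results)")
except GlassdoorException:
    print("Glassdoor search failed (capped at ~900 jobs per search)")
```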

View File

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.25"
+version = "1.1.26"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"

View File

@@ -163,6 +163,7 @@ def scrape_jobs(
         "site",
         "title",
         "company",
+        "company_url",
         "location",
         "job_type",
         "date_posted",

View File

@@ -196,6 +196,8 @@ class JobPost(BaseModel):
     location: Optional[Location]
     description: str | None = None
+    company_url: str | None = None
+
     job_type: list[JobType] | None = None
     compensation: Compensation | None = None
     date_posted: date | None = None

View File

@@ -10,10 +10,10 @@ from datetime import datetime
 import requests
 import time
 from requests.exceptions import ProxyError
-from concurrent.futures import ThreadPoolExecutor, as_completed
 from bs4 import BeautifulSoup
 from bs4.element import Tag
 from threading import Lock
+from urllib.parse import urlparse, urlunparse
 
 from .. import Scraper, ScraperInput, Site
 from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type
@@ -66,12 +66,10 @@ class LinkedInScraper(Scraper):
             if scraper_input.job_type
             else None,
             "pageNum": 0,
-            page: page + scraper_input.offset,
+            "start": page + scraper_input.offset,
             "f_AL": "true" if scraper_input.easy_apply else None,
         }
 
-        params = {k: v for k, v in params.items() if v is not None}
         params = {k: v for k, v in params.items() if v is not None}
 
         retries = 0
         while retries < self.MAX_RETRIES:
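
The pagination fix above deserves a note: `page` is an int variable, so the old dict literal used the page *number* as the key instead of the string `"start"`; the endpoint never received a start offset and kept returning the first page. A standalone sketch of the corrected construction (the `keywords` key is an assumption; only `pageNum` and `start` appear in this hunk):

```python
# Illustrative sketch of the fixed pagination params; LinkedIn's guest
# endpoint pages in steps of 25 via the `start` query parameter.
def build_params(search_term: str, page: int, offset: int) -> dict:
    params = {
        "keywords": search_term,  # assumed key; not shown in this hunk
        "pageNum": 0,
        "start": page + offset,   # the fix: a quoted string key, not the variable `page`
    }
    # drop unset values, as the scraper does
    return {k: v for k, v in params.items() if v is not None}
```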
@@ -88,7 +86,7 @@ class LinkedInScraper(Scraper):
                     break
                 except requests.HTTPError as e:
                     if hasattr(e, "response") and e.response is not None:
-                        if e.response.status_code == 429:
+                        if e.response.status_code in (429, 502):
                             time.sleep(self.DELAY)
                             retries += 1
                             continue
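
The retry condition now also covers 502s. The same pattern in isolation, as a hedged sketch (retry count and delay are illustrative; the scraper's own constants aren't shown in this hunk):

```python
import time
import requests

MAX_RETRIES = 3  # illustrative values, not from this diff
DELAY = 10

def get_with_retry(url: str, **kwargs) -> requests.Response | None:
    """Retry only transient statuses: 429 (rate limit) and 502 (bad gateway)."""
    for _ in range(MAX_RETRIES):
        try:
            response = requests.get(url, timeout=10, **kwargs)
            response.raise_for_status()
            return response
        except requests.HTTPError as e:
            if e.response is not None and e.response.status_code in (429, 502):
                time.sleep(DELAY)
                continue
            raise  # non-transient HTTP errors propagate
    return None
```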
@@ -110,32 +108,27 @@ class LinkedInScraper(Scraper):
         soup = BeautifulSoup(response.text, "html.parser")
 
-        with ThreadPoolExecutor(max_workers=5) as executor:
-            futures = []
-            for job_card in soup.find_all("div", class_="base-search-card"):
-                job_url = None
-                href_tag = job_card.find("a", class_="base-card__full-link")
-                if href_tag and "href" in href_tag.attrs:
-                    href = href_tag.attrs["href"].split("?")[0]
-                    job_id = href.split("-")[-1]
-                    job_url = f"{self.url}/jobs/view/{job_id}"
-
-                with url_lock:
-                    if job_url in seen_urls:
-                        continue
-                    seen_urls.add(job_url)
-
-                futures.append(executor.submit(self.process_job, job_card, job_url))
-
-            for future in as_completed(futures):
-                try:
-                    job_post = future.result()
-                    if job_post:
-                        job_list.append(job_post)
-                except Exception as e:
-                    raise LinkedInException(
-                        "Exception occurred while processing jobs"
-                    )
+        for job_card in soup.find_all("div", class_="base-search-card"):
+            job_url = None
+            href_tag = job_card.find("a", class_="base-card__full-link")
+            if href_tag and "href" in href_tag.attrs:
+                href = href_tag.attrs["href"].split("?")[0]
+                job_id = href.split("-")[-1]
+                job_url = f"{self.url}/jobs/view/{job_id}"
+
+            with url_lock:
+                if job_url in seen_urls:
+                    continue
+                seen_urls.add(job_url)
+
+            # Call process_job directly without threading
+            try:
+                job_post = self.process_job(job_card, job_url)
+                if job_post:
+                    job_list.append(job_post)
+            except Exception as e:
+                raise LinkedInException("Exception occurred while processing jobs")
 
         page += 25
 
         job_list = job_list[: scraper_input.results_wanted]
@@ -147,6 +140,11 @@ class LinkedInScraper(Scraper):
         company_tag = job_card.find("h4", class_="base-search-card__subtitle")
         company_a_tag = company_tag.find("a") if company_tag else None
+        company_url = (
+            urlunparse(urlparse(company_a_tag.get("href"))._replace(query=""))
+            if company_a_tag and company_a_tag.has_attr("href")
+            else ""
+        )
         company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A"
 
         metadata_card = job_card.find("div", class_="base-search-card__metadata")
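
The company URL is cleaned with a small `urllib.parse` idiom: parse the href, blank the query string (tracking parameters), and reassemble. In isolation:

```python
from urllib.parse import urlparse, urlunparse

def strip_query(url: str) -> str:
    """Remove the query string from a URL, keeping scheme/host/path."""
    return urlunparse(urlparse(url)._replace(query=""))

# Hypothetical example:
# strip_query("https://www.linkedin.com/company/acme?trk=public_jobs")
#   -> "https://www.linkedin.com/company/acme"
```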
@@ -168,11 +166,13 @@ class LinkedInScraper(Scraper):
         benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None
 
         description, job_type = self.get_job_description(job_url)
+        # description, job_type = None, []
 
         return JobPost(
             title=title,
             description=description,
             company_name=company,
+            company_url=company_url,
             location=location,
             date_posted=date_posted,
             job_url=job_url,
@@ -193,8 +193,15 @@ class LinkedInScraper(Scraper):
         try:
             response = requests.get(job_page_url, timeout=5, proxies=self.proxy)
             response.raise_for_status()
+        except requests.HTTPError as e:
+            if hasattr(e, "response") and e.response is not None:
+                if e.response.status_code in (429, 502):
+                    time.sleep(self.DELAY)
+            return None, None
         except Exception as e:
             return None, None
+        if response.url == "https://www.linkedin.com/signup":
+            return None, None
 
         soup = BeautifulSoup(response.text, "html.parser")
 
         div_content = soup.find(
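
One subtlety in the hunk above: `requests` follows redirects, so comparing `response.url` against the signup address detects LinkedIn's auth wall after the fact. A minimal standalone sketch (function name hypothetical):

```python
import requests

def fetch_job_page(job_page_url: str) -> str | None:
    """Fetch a job detail page; treat LinkedIn's signup redirect as a miss."""
    resp = requests.get(job_page_url, timeout=5)
    resp.raise_for_status()
    # resp.url is the *final* URL after redirects; LinkedIn bounces
    # throttled guest requests to its signup page.
    if resp.url == "https://www.linkedin.com/signup":
        return None
    return resp.text
```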
@@ -230,7 +237,7 @@ class LinkedInScraper(Scraper):
             employment_type = employment_type.lower()
             employment_type = employment_type.replace("-", "")
 
-            return [get_enum_from_job_type(employment_type)]
+            return [get_enum_from_job_type(employment_type)] if employment_type else []
 
         return description, get_job_type(soup)