mirror of https://github.com/Bunsly/JobSpy
parent a2c8fe046e
commit cc9e7866b7
@@ -62,7 +62,7 @@ zip_recruiter Software Developer TEKsystems Phoenix
 ```plaintext
 Required
-├── site_type (List[enum]): linkedin, zip_recruiter, indeed
+├── site_type (List[enum]): linkedin, zip_recruiter, indeed, glassdoor
 └── search_term (str)
 Optional
 ├── location (int)
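For orientation, a minimal usage sketch of the parameters above with the newly added `glassdoor` site. The keyword `site_name` and the example values are assumptions for illustration (the tree above labels the list `site_type`); `country_indeed` is explained further down:

```python
from jobspy import scrape_jobs

# "glassdoor" is the site this commit adds alongside the existing three
jobs = scrape_jobs(
    site_name=["linkedin", "zip_recruiter", "indeed", "glassdoor"],
    search_term="software engineer",
    location="Phoenix, AZ",   # optional, narrows the search
    results_wanted=20,
    country_indeed="USA",     # required for Indeed/Glassdoor searches
)
print(jobs.head())
```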
@@ -107,18 +107,19 @@ The following exceptions may be raised when using JobSpy:

 * `LinkedInException`
 * `IndeedException`
 * `ZipRecruiterException`
+* `GlassdoorException`

 ## Supported Countries for Job Searching

 ### **LinkedIn**

-LinkedIn searches globally & uses only the `location` parameter.
+LinkedIn searches globally & uses only the `location` parameter. You can fetch 1000 jobs max from the LinkedIn endpoint we're using.

 ### **ZipRecruiter**

 ZipRecruiter searches for jobs in **US/Canada** & uses only the `location` parameter.

-### **Indeed**
+### **Indeed / Glassdoor**

 Indeed & Glassdoor support most countries, but the `country_indeed` parameter is required. Additionally, use the `location`
 parameter to narrow down the location, e.g. city & state if necessary.
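A hedged sketch of catching the site-specific exceptions, including the new `GlassdoorException`. The import path is an assumption for illustration; adjust it to wherever the installed version exposes these classes:

```python
from jobspy import scrape_jobs
# Assumed module path for the exception classes, not confirmed by this diff
from jobspy.scrapers.exceptions import (
    GlassdoorException,
    IndeedException,
    LinkedInException,
    ZipRecruiterException,
)

try:
    jobs = scrape_jobs(
        site_name=["glassdoor"], search_term="data analyst", country_indeed="USA"
    )
except GlassdoorException as exc:
    print(f"Glassdoor scrape failed: {exc}")
except (LinkedInException, IndeedException, ZipRecruiterException) as exc:
    print(f"Another site failed: {exc}")
```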
@@ -145,6 +146,7 @@ You can specify the following countries when searching on Indeed (use the exact
 | Venezuela | Vietnam | | |

+Glassdoor can only fetch 900 jobs from the endpoint we're using on a given search.

 ## Frequently Asked Questions

 ---
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.25"
+version = "1.1.26"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"
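Once this version is published, picking up the changes is a plain package upgrade: `pip install --upgrade python-jobspy`.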
@@ -163,6 +163,7 @@ def scrape_jobs(
         "site",
         "title",
         "company",
+        "company_url",
         "location",
         "job_type",
         "date_posted",
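These strings read as the column ordering applied to the DataFrame that `scrape_jobs` returns, so the new field surfaces as a `company_url` column. A hedged sketch of consuming it:

```python
from jobspy import scrape_jobs

jobs = scrape_jobs(site_name=["linkedin"], search_term="python developer")
# "company_url" is the column this commit adds to the output ordering
print(jobs[["title", "company", "company_url"]].head())
```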
@@ -196,6 +196,8 @@ class JobPost(BaseModel):
     location: Optional[Location]

     description: str | None = None
+    company_url: str | None = None
+
     job_type: list[JobType] | None = None
     compensation: Compensation | None = None
     date_posted: date | None = None
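`JobPost` is a Pydantic model and the new field defaults to `None`, so existing construction sites keep working and populating it is opt-in. A hedged construction sketch; the import path and the required fields beyond this hunk are assumptions:

```python
from jobspy.jobs import JobPost  # assumed import path

post = JobPost(
    title="Software Engineer",
    company_name="Acme Corp",                # assumed required field
    job_url="https://example.com/jobs/123",  # assumed required field
    location=None,
    description="Build and ship things.",
    company_url="https://www.linkedin.com/company/acme-corp",
)
print(post.company_url)
```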
@@ -10,10 +10,10 @@ from datetime import datetime
 import requests
 import time
 from requests.exceptions import ProxyError
-from concurrent.futures import ThreadPoolExecutor, as_completed
 from bs4 import BeautifulSoup
 from bs4.element import Tag
 from threading import Lock
+from urllib.parse import urlparse, urlunparse

 from .. import Scraper, ScraperInput, Site
 from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type
@@ -66,12 +66,10 @@ class LinkedInScraper(Scraper):
                 if scraper_input.job_type
                 else None,
                 "pageNum": 0,
-                page: page + scraper_input.offset,
+                "start": page + scraper_input.offset,
                 "f_AL": "true" if scraper_input.easy_apply else None,
             }

-            params = {k: v for k, v in params.items() if v is not None}
-
             params = {k: v for k, v in params.items() if v is not None}
             retries = 0
             while retries < self.MAX_RETRIES:
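Two things happen in that hunk: the duplicated `params` comprehension is dropped, and a real bug is fixed. The old entry used the bare name `page` as the dict key, so the key was whatever integer `page` currently held rather than the `start` query parameter the endpoint expects. In isolation:

```python
page = 25
offset = 0

broken = {page: page + offset}    # key is the int 25 -> serialized as "25=25"
fixed = {"start": page + offset}  # serialized as the intended "start=25"
print(broken, fixed)
```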
@@ -88,7 +86,7 @@ class LinkedInScraper(Scraper):
                     break
                 except requests.HTTPError as e:
                     if hasattr(e, "response") and e.response is not None:
-                        if e.response.status_code == 429:
+                        if e.response.status_code in (429, 502):
                             time.sleep(self.DELAY)
                             retries += 1
                             continue
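The widened check treats 502 (bad gateway) as retryable alongside 429 (rate limited). A standalone, hedged sketch of the same pattern; the helper name and defaults are illustrative, not from the scraper:

```python
import time
import requests

RETRYABLE = (429, 502)  # rate limited / bad gateway, as in the hunk above

def get_with_retry(url: str, max_retries: int = 3, delay: float = 10.0, **kwargs):
    """Retry a GET while the server answers with a retryable status."""
    for _ in range(max_retries):
        try:
            resp = requests.get(url, timeout=5, **kwargs)
            resp.raise_for_status()
            return resp
        except requests.HTTPError as e:
            if e.response is not None and e.response.status_code in RETRYABLE:
                time.sleep(delay)
                continue
            raise
    return None
```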
@@ -110,32 +108,27 @@ class LinkedInScraper(Scraper):
         soup = BeautifulSoup(response.text, "html.parser")

-        with ThreadPoolExecutor(max_workers=5) as executor:
-            futures = []
-            for job_card in soup.find_all("div", class_="base-search-card"):
-                job_url = None
-                href_tag = job_card.find("a", class_="base-card__full-link")
-                if href_tag and "href" in href_tag.attrs:
-                    href = href_tag.attrs["href"].split("?")[0]
-                    job_id = href.split("-")[-1]
-                    job_url = f"{self.url}/jobs/view/{job_id}"
-
-                with url_lock:
-                    if job_url in seen_urls:
-                        continue
-                    seen_urls.add(job_url)
-
-                futures.append(executor.submit(self.process_job, job_card, job_url))
-
-            for future in as_completed(futures):
-                try:
-                    job_post = future.result()
-                    if job_post:
-                        job_list.append(job_post)
-                except Exception as e:
-                    raise LinkedInException(
-                        "Exception occurred while processing jobs"
-                    )
+        for job_card in soup.find_all("div", class_="base-search-card"):
+            job_url = None
+            href_tag = job_card.find("a", class_="base-card__full-link")
+            if href_tag and "href" in href_tag.attrs:
+                href = href_tag.attrs["href"].split("?")[0]
+                job_id = href.split("-")[-1]
+                job_url = f"{self.url}/jobs/view/{job_id}"
+
+            with url_lock:
+                if job_url in seen_urls:
+                    continue
+                seen_urls.add(job_url)
+
+            # Call process_job directly without threading
+            try:
+                job_post = self.process_job(job_card, job_url)
+                if job_post:
+                    job_list.append(job_post)
+            except Exception as e:
+                raise LinkedInException("Exception occurred while processing jobs")

         page += 25

         job_list = job_list[: scraper_input.results_wanted]
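The restructure drops the `ThreadPoolExecutor`, so cards are now processed sequentially; presumably this simplifies error handling and paces requests more gently, at the cost of throughput. Also worth seeing is why the loop rebuilds each URL: stripping the query string and keeping only the trailing job id yields one canonical URL per posting, which makes the lock-guarded `seen_urls` set a reliable dedup key. With a made-up href:

```python
href = "https://www.linkedin.com/jobs/view/senior-python-developer-at-acme-3749?refId=abc"
base = "https://www.linkedin.com"

job_id = href.split("?")[0].split("-")[-1]  # drop query, keep trailing id -> "3749"
job_url = f"{base}/jobs/view/{job_id}"      # canonical form stored in seen_urls
print(job_url)  # https://www.linkedin.com/jobs/view/3749
```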
@@ -147,6 +140,11 @@ class LinkedInScraper(Scraper):

         company_tag = job_card.find("h4", class_="base-search-card__subtitle")
         company_a_tag = company_tag.find("a") if company_tag else None
+        company_url = (
+            urlunparse(urlparse(company_a_tag.get("href"))._replace(query=""))
+            if company_a_tag and company_a_tag.has_attr("href")
+            else ""
+        )
         company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A"

         metadata_card = job_card.find("div", class_="base-search-card__metadata")
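This is where the new `urllib.parse` import earns its keep: `urlparse` returns a named tuple, so `._replace(query="")` clears just the query component and `urlunparse` reassembles the rest. In isolation, with a made-up link:

```python
from urllib.parse import urlparse, urlunparse

href = "https://www.linkedin.com/company/acme-corp?trk=public_jobs"
clean = urlunparse(urlparse(href)._replace(query=""))
print(clean)  # https://www.linkedin.com/company/acme-corp
```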
@@ -168,11 +166,13 @@ class LinkedInScraper(Scraper):
         benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None

         description, job_type = self.get_job_description(job_url)
+        # description, job_type = None, []

         return JobPost(
             title=title,
             description=description,
             company_name=company,
+            company_url=company_url,
             location=location,
             date_posted=date_posted,
             job_url=job_url,
@@ -193,8 +193,15 @@ class LinkedInScraper(Scraper):
         try:
             response = requests.get(job_page_url, timeout=5, proxies=self.proxy)
             response.raise_for_status()
+        except requests.HTTPError as e:
+            if hasattr(e, "response") and e.response is not None:
+                if e.response.status_code in (429, 502):
+                    time.sleep(self.DELAY)
+            return None, None
         except Exception as e:
             return None, None
+        if response.url == "https://www.linkedin.com/signup":
+            return None, None

         soup = BeautifulSoup(response.text, "html.parser")
         div_content = soup.find(
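Two additions here: rate-limit and gateway errors on the detail page now fail soft (sleep, then `None, None`) instead of bubbling up, and a signup redirect is detected. The latter works because `requests` follows redirects by default, so `response.url` holds the final URL; landing on the signup page means the job detail was never served anonymously. A hedged sketch of that mechanism (`fetch_job_page` is illustrative):

```python
import requests

def fetch_job_page(job_page_url: str) -> str | None:
    """Return the page HTML, or None when LinkedIn bounces us to the signup wall."""
    resp = requests.get(job_page_url, timeout=5)  # redirects are followed by default
    if resp.url == "https://www.linkedin.com/signup":
        return None  # final URL is the signup wall, not the job page
    return resp.text
```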
@@ -230,7 +237,7 @@ class LinkedInScraper(Scraper):
             employment_type = employment_type.lower()
             employment_type = employment_type.replace("-", "")

-            return [get_enum_from_job_type(employment_type)]
+            return [get_enum_from_job_type(employment_type)] if employment_type else []

         return description, get_job_type(soup)
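The new guard matters because `employment_type` can normalize to an empty string when the page omits the value; the old code passed `""` into `get_enum_from_job_type` anyway, while the new code short-circuits to an empty list. Since the falsy check happens first, the lookup is never attempted:

```python
# get_enum_from_job_type lives in jobspy's utils (imported earlier in this diff);
# with an empty string the conditional below never actually calls it.
employment_type = " - ".strip().lower().replace("-", "")  # normalizes to ""
job_types = [get_enum_from_job_type(employment_type)] if employment_type else []
print(job_types)  # []
```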