From cc9e7866b7db273d527809b98f691bd46f6beb03 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Wed, 8 Nov 2023 15:51:07 -0600 Subject: [PATCH] fix linkedin bug & add linkedin company url (#67) --- README.md | 10 ++-- pyproject.toml | 2 +- src/jobspy/__init__.py | 1 + src/jobspy/jobs/__init__.py | 2 + src/jobspy/scrapers/linkedin/__init__.py | 65 +++++++++++++----------- 5 files changed, 46 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 73e60af..6de1dc6 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ zip_recruiter Software Developer TEKsystems Phoenix ```plaintext Required -├── site_type (List[enum]): linkedin, zip_recruiter, indeed +├── site_type (List[enum]): linkedin, zip_recruiter, indeed, glassdoor └── search_term (str) Optional ├── location (int) @@ -107,21 +107,22 @@ The following exceptions may be raised when using JobSpy: * `LinkedInException` * `IndeedException` * `ZipRecruiterException` +* `GlassdoorException` ## Supported Countries for Job Searching ### **LinkedIn** -LinkedIn searches globally & uses only the `location` parameter. +LinkedIn searches globally & uses only the `location` parameter. You can only fetch 1000 jobs max from the LinkedIn endpoint we're using ### **ZipRecruiter** ZipRecruiter searches for jobs in **US/Canada** & uses only the `location` parameter. -### **Indeed** +### **Indeed / Glassdoor** Indeed & Glassdoor supports most countries, but the `country_indeed` parameter is required. Additionally, use the `location` -parameter to narrow down the location, e.g. city & state if necessary. +parameter to narrow down the location, e.g. city & state if necessary. You can specify the following countries when searching on Indeed (use the exact name, * indicates support for Glassdoor): @@ -145,6 +146,7 @@ You can specify the following countries when searching on Indeed (use the exact | Venezuela | Vietnam | | | +Glassdoor can only fetch 900 jobs from the endpoint we're using on a given search. ## Frequently Asked Questions --- diff --git a/pyproject.toml b/pyproject.toml index e5c4a14..ed7c8f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "python-jobspy" -version = "1.1.25" +version = "1.1.26" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/JobSpy" diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py index 7c5fa64..8214a1f 100644 --- a/src/jobspy/__init__.py +++ b/src/jobspy/__init__.py @@ -163,6 +163,7 @@ def scrape_jobs( "site", "title", "company", + "company_url", "location", "job_type", "date_posted", diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py index 0737824..f1fd708 100644 --- a/src/jobspy/jobs/__init__.py +++ b/src/jobspy/jobs/__init__.py @@ -196,6 +196,8 @@ class JobPost(BaseModel): location: Optional[Location] description: str | None = None + company_url: str | None = None + job_type: list[JobType] | None = None compensation: Compensation | None = None date_posted: date | None = None diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py index 26d4390..922e671 100644 --- a/src/jobspy/scrapers/linkedin/__init__.py +++ b/src/jobspy/scrapers/linkedin/__init__.py @@ -10,10 +10,10 @@ from datetime import datetime import requests import time from requests.exceptions import ProxyError -from concurrent.futures import ThreadPoolExecutor, as_completed from bs4 import BeautifulSoup from bs4.element import Tag from threading import Lock +from urllib.parse import urlparse, urlunparse from .. import Scraper, ScraperInput, Site from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type @@ -66,12 +66,10 @@ class LinkedInScraper(Scraper): if scraper_input.job_type else None, "pageNum": 0, - page: page + scraper_input.offset, + "start": page + scraper_input.offset, "f_AL": "true" if scraper_input.easy_apply else None, } - params = {k: v for k, v in params.items() if v is not None} - params = {k: v for k, v in params.items() if v is not None} retries = 0 while retries < self.MAX_RETRIES: @@ -88,7 +86,7 @@ class LinkedInScraper(Scraper): break except requests.HTTPError as e: if hasattr(e, "response") and e.response is not None: - if e.response.status_code == 429: + if e.response.status_code in (429, 502): time.sleep(self.DELAY) retries += 1 continue @@ -110,32 +108,27 @@ class LinkedInScraper(Scraper): soup = BeautifulSoup(response.text, "html.parser") - with ThreadPoolExecutor(max_workers=5) as executor: - futures = [] - for job_card in soup.find_all("div", class_="base-search-card"): - job_url = None - href_tag = job_card.find("a", class_="base-card__full-link") - if href_tag and "href" in href_tag.attrs: - href = href_tag.attrs["href"].split("?")[0] - job_id = href.split("-")[-1] - job_url = f"{self.url}/jobs/view/{job_id}" + for job_card in soup.find_all("div", class_="base-search-card"): + job_url = None + href_tag = job_card.find("a", class_="base-card__full-link") + if href_tag and "href" in href_tag.attrs: + href = href_tag.attrs["href"].split("?")[0] + job_id = href.split("-")[-1] + job_url = f"{self.url}/jobs/view/{job_id}" - with url_lock: - if job_url in seen_urls: - continue - seen_urls.add(job_url) + with url_lock: + if job_url in seen_urls: + continue + seen_urls.add(job_url) - futures.append(executor.submit(self.process_job, job_card, job_url)) + # Call process_job directly without threading + try: + job_post = self.process_job(job_card, job_url) + if job_post: + job_list.append(job_post) + except Exception as e: + raise LinkedInException("Exception occurred while processing jobs") - for future in as_completed(futures): - try: - job_post = future.result() - if job_post: - job_list.append(job_post) - except Exception as e: - raise LinkedInException( - "Exception occurred while processing jobs" - ) page += 25 job_list = job_list[: scraper_input.results_wanted] @@ -147,6 +140,11 @@ class LinkedInScraper(Scraper): company_tag = job_card.find("h4", class_="base-search-card__subtitle") company_a_tag = company_tag.find("a") if company_tag else None + company_url = ( + urlunparse(urlparse(company_a_tag.get("href"))._replace(query="")) + if company_a_tag and company_a_tag.has_attr("href") + else "" + ) company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A" metadata_card = job_card.find("div", class_="base-search-card__metadata") @@ -168,11 +166,13 @@ class LinkedInScraper(Scraper): benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None description, job_type = self.get_job_description(job_url) + # description, job_type = None, [] return JobPost( title=title, description=description, company_name=company, + company_url=company_url, location=location, date_posted=date_posted, job_url=job_url, @@ -193,8 +193,15 @@ class LinkedInScraper(Scraper): try: response = requests.get(job_page_url, timeout=5, proxies=self.proxy) response.raise_for_status() + except requests.HTTPError as e: + if hasattr(e, "response") and e.response is not None: + if e.response.status_code in (429, 502): + time.sleep(self.DELAY) + return None, None except Exception as e: return None, None + if response.url == "https://www.linkedin.com/signup": + return None, None soup = BeautifulSoup(response.text, "html.parser") div_content = soup.find( @@ -230,7 +237,7 @@ class LinkedInScraper(Scraper): employment_type = employment_type.lower() employment_type = employment_type.replace("-", "") - return [get_enum_from_job_type(employment_type)] + return [get_enum_from_job_type(employment_type)] if employment_type else [] return description, get_job_type(soup)