From a2c8fe046e03ec4e854dc1fc070bfb09a5d2125d Mon Sep 17 00:00:00 2001 From: Zachary Hampton Date: Mon, 6 Nov 2023 22:13:19 -0700 Subject: [PATCH 1/4] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4238921..73e60af 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ **Not technical?** Try out the web scraping tool on our site at [usejobspy.com](https://usejobspy.com). -*Looking to build a data-focused software product?* **[Book a call](https://calendly.com/bunsly/15min)** *to +*Looking to build a data-focused software product?* **[Book a call](https://bunsly.com/)** *to work with us.* Check out another project we wrote: ***[HomeHarvest](https://github.com/Bunsly/HomeHarvest)** – a Python package From cc9e7866b7db273d527809b98f691bd46f6beb03 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Wed, 8 Nov 2023 15:51:07 -0600 Subject: [PATCH 2/4] fix linkedin bug & add linkedin company url (#67) --- README.md | 10 ++-- pyproject.toml | 2 +- src/jobspy/__init__.py | 1 + src/jobspy/jobs/__init__.py | 2 + src/jobspy/scrapers/linkedin/__init__.py | 65 +++++++++++++----------- 5 files changed, 46 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 73e60af..6de1dc6 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ zip_recruiter Software Developer TEKsystems Phoenix ```plaintext Required -├── site_type (List[enum]): linkedin, zip_recruiter, indeed +├── site_type (List[enum]): linkedin, zip_recruiter, indeed, glassdoor └── search_term (str) Optional ├── location (int) @@ -107,21 +107,22 @@ The following exceptions may be raised when using JobSpy: * `LinkedInException` * `IndeedException` * `ZipRecruiterException` +* `GlassdoorException` ## Supported Countries for Job Searching ### **LinkedIn** -LinkedIn searches globally & uses only the `location` parameter. +LinkedIn searches globally & uses only the `location` parameter. You can only fetch 1000 jobs max from the LinkedIn endpoint we're using ### **ZipRecruiter** ZipRecruiter searches for jobs in **US/Canada** & uses only the `location` parameter. -### **Indeed** +### **Indeed / Glassdoor** Indeed & Glassdoor supports most countries, but the `country_indeed` parameter is required. Additionally, use the `location` -parameter to narrow down the location, e.g. city & state if necessary. +parameter to narrow down the location, e.g. city & state if necessary. You can specify the following countries when searching on Indeed (use the exact name, * indicates support for Glassdoor): @@ -145,6 +146,7 @@ You can specify the following countries when searching on Indeed (use the exact | Venezuela | Vietnam | | | +Glassdoor can only fetch 900 jobs from the endpoint we're using on a given search. ## Frequently Asked Questions --- diff --git a/pyproject.toml b/pyproject.toml index e5c4a14..ed7c8f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "python-jobspy" -version = "1.1.25" +version = "1.1.26" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/JobSpy" diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py index 7c5fa64..8214a1f 100644 --- a/src/jobspy/__init__.py +++ b/src/jobspy/__init__.py @@ -163,6 +163,7 @@ def scrape_jobs( "site", "title", "company", + "company_url", "location", "job_type", "date_posted", diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py index 0737824..f1fd708 100644 --- a/src/jobspy/jobs/__init__.py +++ b/src/jobspy/jobs/__init__.py @@ -196,6 +196,8 @@ class JobPost(BaseModel): location: Optional[Location] description: str | None = None + company_url: str | None = None + job_type: list[JobType] | None = None compensation: Compensation | None = None date_posted: date | None = None diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py index 26d4390..922e671 100644 --- a/src/jobspy/scrapers/linkedin/__init__.py +++ b/src/jobspy/scrapers/linkedin/__init__.py @@ -10,10 +10,10 @@ from datetime import datetime import requests import time from requests.exceptions import ProxyError -from concurrent.futures import ThreadPoolExecutor, as_completed from bs4 import BeautifulSoup from bs4.element import Tag from threading import Lock +from urllib.parse import urlparse, urlunparse from .. import Scraper, ScraperInput, Site from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type @@ -66,12 +66,10 @@ class LinkedInScraper(Scraper): if scraper_input.job_type else None, "pageNum": 0, - page: page + scraper_input.offset, + "start": page + scraper_input.offset, "f_AL": "true" if scraper_input.easy_apply else None, } - params = {k: v for k, v in params.items() if v is not None} - params = {k: v for k, v in params.items() if v is not None} retries = 0 while retries < self.MAX_RETRIES: @@ -88,7 +86,7 @@ class LinkedInScraper(Scraper): break except requests.HTTPError as e: if hasattr(e, "response") and e.response is not None: - if e.response.status_code == 429: + if e.response.status_code in (429, 502): time.sleep(self.DELAY) retries += 1 continue @@ -110,32 +108,27 @@ class LinkedInScraper(Scraper): soup = BeautifulSoup(response.text, "html.parser") - with ThreadPoolExecutor(max_workers=5) as executor: - futures = [] - for job_card in soup.find_all("div", class_="base-search-card"): - job_url = None - href_tag = job_card.find("a", class_="base-card__full-link") - if href_tag and "href" in href_tag.attrs: - href = href_tag.attrs["href"].split("?")[0] - job_id = href.split("-")[-1] - job_url = f"{self.url}/jobs/view/{job_id}" + for job_card in soup.find_all("div", class_="base-search-card"): + job_url = None + href_tag = job_card.find("a", class_="base-card__full-link") + if href_tag and "href" in href_tag.attrs: + href = href_tag.attrs["href"].split("?")[0] + job_id = href.split("-")[-1] + job_url = f"{self.url}/jobs/view/{job_id}" - with url_lock: - if job_url in seen_urls: - continue - seen_urls.add(job_url) + with url_lock: + if job_url in seen_urls: + continue + seen_urls.add(job_url) - futures.append(executor.submit(self.process_job, job_card, job_url)) + # Call process_job directly without threading + try: + job_post = self.process_job(job_card, job_url) + if job_post: + job_list.append(job_post) + except Exception as e: + raise LinkedInException("Exception occurred while processing jobs") - for future in as_completed(futures): - try: - job_post = future.result() - if job_post: - job_list.append(job_post) - except Exception as e: - raise LinkedInException( - "Exception occurred while processing jobs" - ) page += 25 job_list = job_list[: scraper_input.results_wanted] @@ -147,6 +140,11 @@ class LinkedInScraper(Scraper): company_tag = job_card.find("h4", class_="base-search-card__subtitle") company_a_tag = company_tag.find("a") if company_tag else None + company_url = ( + urlunparse(urlparse(company_a_tag.get("href"))._replace(query="")) + if company_a_tag and company_a_tag.has_attr("href") + else "" + ) company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A" metadata_card = job_card.find("div", class_="base-search-card__metadata") @@ -168,11 +166,13 @@ class LinkedInScraper(Scraper): benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None description, job_type = self.get_job_description(job_url) + # description, job_type = None, [] return JobPost( title=title, description=description, company_name=company, + company_url=company_url, location=location, date_posted=date_posted, job_url=job_url, @@ -193,8 +193,15 @@ class LinkedInScraper(Scraper): try: response = requests.get(job_page_url, timeout=5, proxies=self.proxy) response.raise_for_status() + except requests.HTTPError as e: + if hasattr(e, "response") and e.response is not None: + if e.response.status_code in (429, 502): + time.sleep(self.DELAY) + return None, None except Exception as e: return None, None + if response.url == "https://www.linkedin.com/signup": + return None, None soup = BeautifulSoup(response.text, "html.parser") div_content = soup.find( @@ -230,7 +237,7 @@ class LinkedInScraper(Scraper): employment_type = employment_type.lower() employment_type = employment_type.replace("-", "") - return [get_enum_from_job_type(employment_type)] + return [get_enum_from_job_type(employment_type)] if employment_type else [] return description, get_job_type(soup) From 81f70ff8a577c2da1eb53a13361420146f30df04 Mon Sep 17 00:00:00 2001 From: Faraz Khan Date: Fri, 10 Nov 2023 01:57:15 +0500 Subject: [PATCH 3/4] added salary data for linkedin (#68) --- pyproject.toml | 2 +- src/jobspy/scrapers/linkedin/__init__.py | 21 +++++++++++++++++++-- src/jobspy/scrapers/utils.py | 17 +++++++++++++++++ 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ed7c8f9..12a694d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "python-jobspy" -version = "1.1.26" +version = "1.1.27" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/JobSpy" diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py index 922e671..5fcc696 100644 --- a/src/jobspy/scrapers/linkedin/__init__.py +++ b/src/jobspy/scrapers/linkedin/__init__.py @@ -16,9 +16,9 @@ from threading import Lock from urllib.parse import urlparse, urlunparse from .. import Scraper, ScraperInput, Site -from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type +from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type, currency_parser from ..exceptions import LinkedInException -from ...jobs import JobPost, Location, JobResponse, JobType, Country +from ...jobs import JobPost, Location, JobResponse, JobType, Country, Compensation class LinkedInScraper(Scraper): @@ -135,6 +135,22 @@ class LinkedInScraper(Scraper): return JobResponse(jobs=job_list) def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]: + salary_tag = job_card.find('span', class_='job-search-card__salary-info') + + compensation = None + if salary_tag: + salary_text = salary_tag.get_text(separator=' ').strip() + salary_values = [currency_parser(value) for value in salary_text.split('-')] + salary_min = salary_values[0] + salary_max = salary_values[1] + currency = salary_text[0] if salary_text[0] != '$' else 'USD' + + compensation = Compensation( + min_amount=int(salary_min), + max_amount=int(salary_max), + currency=currency, + ) + title_tag = job_card.find("span", class_="sr-only") title = title_tag.get_text(strip=True) if title_tag else "N/A" @@ -177,6 +193,7 @@ class LinkedInScraper(Scraper): date_posted=date_posted, job_url=job_url, job_type=job_type, + compensation=compensation, benefits=benefits, emails=extract_emails_from_text(description) if description else None, num_urgent_words=count_urgent_words(description) if description else None, diff --git a/src/jobspy/scrapers/utils.py b/src/jobspy/scrapers/utils.py index 5e5ffb0..c44b875 100644 --- a/src/jobspy/scrapers/utils.py +++ b/src/jobspy/scrapers/utils.py @@ -1,4 +1,5 @@ import re +import numpy as np import requests import tls_client @@ -62,3 +63,19 @@ def get_enum_from_job_type(job_type_str: str) -> JobType | None: if job_type_str in job_type.value: res = job_type return res + +def currency_parser(cur_str): + # Remove any non-numerical characters + # except for ',' '.' or '-' (e.g. EUR) + cur_str = re.sub("[^-0-9.,]", '', cur_str) + # Remove any 000s separators (either , or .) + cur_str = re.sub("[.,]", '', cur_str[:-3]) + cur_str[-3:] + + if '.' in list(cur_str[-3:]): + num = float(cur_str) + elif ',' in list(cur_str[-3:]): + num = float(cur_str.replace(',', '.')) + else: + num = float(cur_str) + + return np.round(num, 2) From dfb8c18c518496009fb6210e7e526e5358c720f8 Mon Sep 17 00:00:00 2001 From: Faraz Khan Date: Sat, 11 Nov 2023 03:59:42 +0500 Subject: [PATCH 4/4] include location with 3 parts (#69) --- pyproject.toml | 2 +- src/jobspy/scrapers/linkedin/__init__.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 12a694d..08272d6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "python-jobspy" -version = "1.1.27" +version = "1.1.28" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/JobSpy" diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py index 5fcc696..67d2898 100644 --- a/src/jobspy/scrapers/linkedin/__init__.py +++ b/src/jobspy/scrapers/linkedin/__init__.py @@ -278,5 +278,12 @@ class LinkedInScraper(Scraper): state=state, country=Country.from_string(self.country), ) + elif len(parts) == 3: + city, state, country = parts + location = Location( + city=city, + state=state, + country=Country.from_string(country), + ) return location