diff --git a/pyproject.toml b/pyproject.toml
index ed7c8f9..12a694d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.26"
+version = "1.1.27"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton ", "Cullen Watson "]
 homepage = "https://github.com/Bunsly/JobSpy"
diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py
index 922e671..5fcc696 100644
--- a/src/jobspy/scrapers/linkedin/__init__.py
+++ b/src/jobspy/scrapers/linkedin/__init__.py
@@ -16,9 +16,9 @@ from threading import Lock
 from urllib.parse import urlparse, urlunparse
 
 from .. import Scraper, ScraperInput, Site
-from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type
+from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type, currency_parser
 from ..exceptions import LinkedInException
-from ...jobs import JobPost, Location, JobResponse, JobType, Country
+from ...jobs import JobPost, Location, JobResponse, JobType, Country, Compensation
 
 
 class LinkedInScraper(Scraper):
@@ -135,6 +135,30 @@ class LinkedInScraper(Scraper):
         return JobResponse(jobs=job_list)
 
     def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]:
+        salary_tag = job_card.find('span', class_='job-search-card__salary-info')
+
+        compensation = None
+        if salary_tag:
+            salary_text = salary_tag.get_text(separator=' ').strip()
+            # Ignore fragments without digits so an empty tag or a stray "-"
+            # cannot make currency_parser() raise.
+            salary_values = [
+                currency_parser(value)
+                for value in salary_text.split('-')
+                if any(char.isdigit() for char in value)
+            ]
+            if salary_values:
+                salary_min = salary_values[0]
+                # A single figure (no "min - max" range) serves as both bounds.
+                salary_max = salary_values[1] if len(salary_values) > 1 else salary_min
+                currency = salary_text[0] if salary_text[0] != '$' else 'USD'
+
+                compensation = Compensation(
+                    min_amount=int(salary_min),
+                    max_amount=int(salary_max),
+                    currency=currency,
+                )
+
         title_tag = job_card.find("span", class_="sr-only")
         title = title_tag.get_text(strip=True) if title_tag else "N/A"
 
@@ -177,6 +201,7 @@ class LinkedInScraper(Scraper):
             date_posted=date_posted,
             job_url=job_url,
             job_type=job_type,
+            compensation=compensation,
             benefits=benefits,
             emails=extract_emails_from_text(description) if description else None,
             num_urgent_words=count_urgent_words(description) if description else None,
diff --git a/src/jobspy/scrapers/utils.py b/src/jobspy/scrapers/utils.py
index 5e5ffb0..c44b875 100644
--- a/src/jobspy/scrapers/utils.py
+++ b/src/jobspy/scrapers/utils.py
@@ -1,4 +1,5 @@
 import re
+import numpy as np
 
 import requests
 import tls_client
@@ -62,3 +63,31 @@ def get_enum_from_job_type(job_type_str: str) -> JobType | None:
         if job_type_str in job_type.value:
             res = job_type
     return res
+
+
+def currency_parser(cur_str):
+    """
+    Parse a formatted money string such as "$1,234.56" or "1.234,56" into a number.
+
+    "." and "," are both accepted as thousands or decimal separator: a separator
+    within the last three characters is read as the decimal mark, any earlier one
+    as a thousands separator.
+
+    :param cur_str: text containing a single amount
+    :return: the amount rounded to two decimal places
+    :raises ValueError: if no number can be parsed from cur_str
+    """
+    # Remove any non-numerical characters
+    # except for ',' '.' or '-' (drops symbols and codes, e.g. EUR)
+    digits = re.sub(r"[^-0-9.,]", "", cur_str)
+    if not any(char.isdigit() for char in digits):
+        raise ValueError(f"no amount found in {cur_str!r}")
+
+    # Remove any 000s separators (either , or .); the last three characters
+    # are kept as-is because they may hold the decimal separator
+    digits = re.sub(r"[.,]", "", digits[:-3]) + digits[-3:]
+
+    # Any separator still present sits in the tail and is the decimal mark;
+    # float() needs it written as a dot
+    num = float(digits.replace(",", "."))
+
+    return np.round(num, 2)