added salary data for linkedin (#68)

pull/69/head v1.1.27
Faraz Khan 2023-11-10 01:57:15 +05:00 committed by GitHub
parent cc9e7866b7
commit 81f70ff8a5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 37 additions and 3 deletions

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "python-jobspy" name = "python-jobspy"
version = "1.1.26" version = "1.1.27"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter" description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy" homepage = "https://github.com/Bunsly/JobSpy"

View File

@ -16,9 +16,9 @@ from threading import Lock
from urllib.parse import urlparse, urlunparse from urllib.parse import urlparse, urlunparse
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type, currency_parser
from ..exceptions import LinkedInException from ..exceptions import LinkedInException
from ...jobs import JobPost, Location, JobResponse, JobType, Country from ...jobs import JobPost, Location, JobResponse, JobType, Country, Compensation
class LinkedInScraper(Scraper): class LinkedInScraper(Scraper):
@ -135,6 +135,22 @@ class LinkedInScraper(Scraper):
return JobResponse(jobs=job_list) return JobResponse(jobs=job_list)
def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]: def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]:
salary_tag = job_card.find('span', class_='job-search-card__salary-info')
compensation = None
if salary_tag:
salary_text = salary_tag.get_text(separator=' ').strip()
salary_values = [currency_parser(value) for value in salary_text.split('-')]
salary_min = salary_values[0]
salary_max = salary_values[1]
currency = salary_text[0] if salary_text[0] != '$' else 'USD'
compensation = Compensation(
min_amount=int(salary_min),
max_amount=int(salary_max),
currency=currency,
)
title_tag = job_card.find("span", class_="sr-only") title_tag = job_card.find("span", class_="sr-only")
title = title_tag.get_text(strip=True) if title_tag else "N/A" title = title_tag.get_text(strip=True) if title_tag else "N/A"
@ -177,6 +193,7 @@ class LinkedInScraper(Scraper):
date_posted=date_posted, date_posted=date_posted,
job_url=job_url, job_url=job_url,
job_type=job_type, job_type=job_type,
compensation=compensation,
benefits=benefits, benefits=benefits,
emails=extract_emails_from_text(description) if description else None, emails=extract_emails_from_text(description) if description else None,
num_urgent_words=count_urgent_words(description) if description else None, num_urgent_words=count_urgent_words(description) if description else None,

View File

@ -1,4 +1,5 @@
import re import re
import numpy as np
import requests import requests
import tls_client import tls_client
@ -62,3 +63,19 @@ def get_enum_from_job_type(job_type_str: str) -> JobType | None:
if job_type_str in job_type.value: if job_type_str in job_type.value:
res = job_type res = job_type
return res return res
def currency_parser(cur_str):
# Remove any non-numerical characters
# except for ',' '.' or '-' (e.g. EUR)
cur_str = re.sub("[^-0-9.,]", '', cur_str)
# Remove any 000s separators (either , or .)
cur_str = re.sub("[.,]", '', cur_str[:-3]) + cur_str[-3:]
if '.' in list(cur_str[-3:]):
num = float(cur_str)
elif ',' in list(cur_str[-3:]):
num = float(cur_str.replace(',', '.'))
else:
num = float(cur_str)
return np.round(num, 2)