mirror of
https://github.com/Bunsly/JobSpy.git
synced 2026-03-05 12:04:33 -08:00
Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
eed7fca300 | ||
|
|
dfb8c18c51 | ||
|
|
81f70ff8a5 |
@@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "python-jobspy"
|
name = "python-jobspy"
|
||||||
version = "1.1.26"
|
version = "1.1.29"
|
||||||
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
||||||
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
||||||
homepage = "https://github.com/Bunsly/JobSpy"
|
homepage = "https://github.com/Bunsly/JobSpy"
|
||||||
|
|||||||
@@ -121,7 +121,7 @@ class Country(Enum):
|
|||||||
# internal for ziprecruiter
|
# internal for ziprecruiter
|
||||||
US_CANADA = ("usa/ca", "www")
|
US_CANADA = ("usa/ca", "www")
|
||||||
|
|
||||||
# internal for linkeind
|
# internal for linkedin
|
||||||
WORLDWIDE = ("worldwide", "www")
|
WORLDWIDE = ("worldwide", "www")
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -235,24 +235,9 @@ class IndeedScraper(Scraper):
|
|||||||
if response.status_code not in range(200, 400):
|
if response.status_code not in range(200, 400):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
|
||||||
script_tag = soup.find(
|
|
||||||
"script", text=lambda x: x and "window._initialData" in x
|
|
||||||
)
|
|
||||||
|
|
||||||
if not script_tag:
|
|
||||||
return None
|
|
||||||
|
|
||||||
script_code = script_tag.string
|
|
||||||
match = re.search(r"window\._initialData\s*=\s*({.*?})\s*;", script_code, re.S)
|
|
||||||
|
|
||||||
if not match:
|
|
||||||
return None
|
|
||||||
|
|
||||||
json_string = match.group(1)
|
|
||||||
data = json.loads(json_string)
|
|
||||||
try:
|
try:
|
||||||
job_description = data["jobInfoWrapperModel"]["jobInfoModel"][
|
data = json.loads(response.text)
|
||||||
|
job_description = data["body"]["jobInfoWrapperModel"]["jobInfoModel"][
|
||||||
"sanitizedJobDescription"
|
"sanitizedJobDescription"
|
||||||
]
|
]
|
||||||
except (KeyError, TypeError, IndexError):
|
except (KeyError, TypeError, IndexError):
|
||||||
|
|||||||
@@ -16,9 +16,9 @@ from threading import Lock
|
|||||||
from urllib.parse import urlparse, urlunparse
|
from urllib.parse import urlparse, urlunparse
|
||||||
|
|
||||||
from .. import Scraper, ScraperInput, Site
|
from .. import Scraper, ScraperInput, Site
|
||||||
from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type
|
from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type, currency_parser
|
||||||
from ..exceptions import LinkedInException
|
from ..exceptions import LinkedInException
|
||||||
from ...jobs import JobPost, Location, JobResponse, JobType, Country
|
from ...jobs import JobPost, Location, JobResponse, JobType, Country, Compensation
|
||||||
|
|
||||||
|
|
||||||
class LinkedInScraper(Scraper):
|
class LinkedInScraper(Scraper):
|
||||||
@@ -135,6 +135,22 @@ class LinkedInScraper(Scraper):
|
|||||||
return JobResponse(jobs=job_list)
|
return JobResponse(jobs=job_list)
|
||||||
|
|
||||||
def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]:
|
def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]:
|
||||||
|
salary_tag = job_card.find('span', class_='job-search-card__salary-info')
|
||||||
|
|
||||||
|
compensation = None
|
||||||
|
if salary_tag:
|
||||||
|
salary_text = salary_tag.get_text(separator=' ').strip()
|
||||||
|
salary_values = [currency_parser(value) for value in salary_text.split('-')]
|
||||||
|
salary_min = salary_values[0]
|
||||||
|
salary_max = salary_values[1]
|
||||||
|
currency = salary_text[0] if salary_text[0] != '$' else 'USD'
|
||||||
|
|
||||||
|
compensation = Compensation(
|
||||||
|
min_amount=int(salary_min),
|
||||||
|
max_amount=int(salary_max),
|
||||||
|
currency=currency,
|
||||||
|
)
|
||||||
|
|
||||||
title_tag = job_card.find("span", class_="sr-only")
|
title_tag = job_card.find("span", class_="sr-only")
|
||||||
title = title_tag.get_text(strip=True) if title_tag else "N/A"
|
title = title_tag.get_text(strip=True) if title_tag else "N/A"
|
||||||
|
|
||||||
@@ -177,6 +193,7 @@ class LinkedInScraper(Scraper):
|
|||||||
date_posted=date_posted,
|
date_posted=date_posted,
|
||||||
job_url=job_url,
|
job_url=job_url,
|
||||||
job_type=job_type,
|
job_type=job_type,
|
||||||
|
compensation=compensation,
|
||||||
benefits=benefits,
|
benefits=benefits,
|
||||||
emails=extract_emails_from_text(description) if description else None,
|
emails=extract_emails_from_text(description) if description else None,
|
||||||
num_urgent_words=count_urgent_words(description) if description else None,
|
num_urgent_words=count_urgent_words(description) if description else None,
|
||||||
@@ -261,5 +278,12 @@ class LinkedInScraper(Scraper):
|
|||||||
state=state,
|
state=state,
|
||||||
country=Country.from_string(self.country),
|
country=Country.from_string(self.country),
|
||||||
)
|
)
|
||||||
|
elif len(parts) == 3:
|
||||||
|
city, state, country = parts
|
||||||
|
location = Location(
|
||||||
|
city=city,
|
||||||
|
state=state,
|
||||||
|
country=Country.from_string(country),
|
||||||
|
)
|
||||||
|
|
||||||
return location
|
return location
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import re
|
import re
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
import tls_client
|
import tls_client
|
||||||
@@ -62,3 +63,19 @@ def get_enum_from_job_type(job_type_str: str) -> JobType | None:
|
|||||||
if job_type_str in job_type.value:
|
if job_type_str in job_type.value:
|
||||||
res = job_type
|
res = job_type
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
def currency_parser(cur_str):
    """Convert a human-formatted money string into a number.

    Currency symbols, letters and whitespace are discarded. Any '.' or ','
    that appears before the last three characters is treated as a thousands
    separator and dropped; a '.' or ',' inside the last three characters is
    treated as the decimal mark. This covers both "1,234.56" and "1.234,56"
    style inputs.

    Args:
        cur_str: Text containing a single amount, e.g. "$100,000" or
            "EUR 1.234,56".

    Returns:
        The amount rounded to two decimals (the value produced by
        ``np.round``).

    Raises:
        ValueError: If the text contains no digits or the remaining
            characters do not form a valid number.
    """
    # Keep only digits, the two possible separators and the minus sign.
    cleaned = re.sub(r"[^-0-9.,]", "", cur_str)
    if not any(ch.isdigit() for ch in cleaned):
        # Fail with a message naming the input instead of float('')'s
        # generic "could not convert string to float".
        raise ValueError(f"no numeric amount found in {cur_str!r}")

    # Only the last three characters can hold a decimal mark; separators
    # anywhere earlier are thousands groupings and are removed.
    head, tail = cleaned[:-3], cleaned[-3:]
    cleaned = re.sub(r"[.,]", "", head) + tail

    # A decimal comma has to become a dot before float() can parse it. When a
    # dot is already present in the tail the string is left untouched.
    if "." not in tail and "," in tail:
        cleaned = cleaned.replace(",", ".")

    return np.round(float(cleaned), 2)
|
||||||
|
|||||||
Reference in New Issue
Block a user