mirror of https://github.com/Bunsly/JobSpy
parent 5e71866630
commit f2cc74b7f2
@@ -92,9 +92,9 @@ JobPost
 │ ├── city (str)
 │ ├── state (str)
 ├── description (str)
-├── job_type (enum): fulltime, parttime, internship, contract
+├── job_type (str): fulltime, parttime, internship, contract
 ├── compensation (object)
-│ ├── interval (enum): yearly, monthly, weekly, daily, hourly
+│ ├── interval (str): yearly, monthly, weekly, daily, hourly
 │ ├── min_amount (int)
 │ ├── max_amount (int)
 │ └── currency (enum)
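Since the schema now documents job_type and compensation interval as plain strings, downstream code can filter on them with ordinary string comparisons. A minimal sketch, not part of the commit; the sample rows are assumptions shaped after the schema above, and a pandas DataFrame is what scrape_jobs is documented to return:

import pandas as pd

# Hypothetical rows mirroring the JobPost schema above.
jobs = pd.DataFrame([
    {"title": "Backend Engineer", "job_type": "fulltime", "interval": "yearly", "min_amount": 90000},
    {"title": "QA Intern", "job_type": "internship", "interval": "hourly", "min_amount": 20},
])

# job_type and interval are plain strings, so no enum import is needed to filter.
full_time = jobs[jobs["job_type"] == "fulltime"]
print(full_time[["title", "min_amount"]])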
@@ -1,9 +1,9 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.13"
+version = "1.1.14"
 description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
-authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
-homepage = "https://github.com/cullenwatson/JobSpy"
+authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
+homepage = "https://github.com/Bunsly/JobSpy"
 readme = "README.md"

 packages = [
@@ -16,6 +16,7 @@ requests = "^2.31.0"
 tls-client = "^0.2.1"
 beautifulsoup4 = "^4.12.2"
 pandas = "^2.1.0"
+numpy = "1.24.2"
 pydantic = "^2.3.0"
@@ -84,13 +84,12 @@ def scrape_jobs(
         except (LinkedInException, IndeedException, ZipRecruiterException) as lie:
             raise lie
         except Exception as e:
             # unhandled exceptions
             if site == Site.LINKEDIN:
-                raise LinkedInException()
+                raise LinkedInException(str(e))
             if site == Site.INDEED:
-                raise IndeedException()
+                raise IndeedException(str(e))
             if site == Site.ZIP_RECRUITER:
-                raise ZipRecruiterException()
+                raise ZipRecruiterException(str(e))
             else:
                 raise e
         return site.value, scraped_data
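With this change, unexpected failures are re-raised as site-specific exceptions that carry the original error text. A caller-side sketch; the scrape_jobs keyword arguments and import paths are assumptions based on the package README, not shown in this diff:

from jobspy import scrape_jobs
from jobspy.scrapers.exceptions import (
    LinkedInException,
    IndeedException,
    ZipRecruiterException,
)

try:
    # site_name and search_term follow the README; adjust if the real signature differs.
    jobs = scrape_jobs(site_name=["linkedin"], search_term="software engineer")
except (LinkedInException, IndeedException, ZipRecruiterException) as e:
    # str(e) now contains the wrapped error's message rather than being empty.
    print(f"Scraping failed: {e}")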
@@ -37,10 +37,16 @@ class JobType(Enum):
         "повназайнятість",
         "toànthờigian",
     )
-    PART_TIME = ("parttime", "teilzeit")
+    PART_TIME = ("parttime", "teilzeit", "částečnýúvazek", "deltid")
     CONTRACT = ("contract", "contractor")
     TEMPORARY = ("temporary",)
-    INTERNSHIP = ("internship", "prácticas", "ojt(onthejobtraining)", "praktikum")
+    INTERNSHIP = (
+        "internship",
+        "prácticas",
+        "ojt(onthejobtraining)",
+        "praktikum",
+        "praktik",
+    )

     PER_DIEM = ("perdiem",)
     NIGHTS = ("nights",)
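Each JobType member's value is a tuple of normalized aliases (lower-cased, with spaces and hyphens removed), so a scraped label matches a member when its normalized form appears in that tuple. A small sketch of that lookup; the jobspy.jobs import path is an assumption inferred from "from ..jobs import JobType" elsewhere in this diff:

from jobspy.jobs import JobType  # import path assumed

def to_job_type(label: str) -> JobType | None:
    # Normalize the raw label the same way the scrapers do before the lookup.
    normalized = label.lower().replace("-", "").replace(" ", "")
    for job_type in JobType:
        if normalized in job_type.value:
            return job_type
    return None

print(to_job_type("Part-time"))  # JobType.PART_TIME
print(to_job_type("praktik"))    # JobType.INTERNSHIP (alias added in this commit)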
@@ -7,12 +7,15 @@ This module contains the set of Scrapers' exceptions.


 class LinkedInException(Exception):
-    """Failed to scrape LinkedIn"""
+    def __init__(self, message=None):
+        super().__init__(message or "An error occurred with LinkedIn")


 class IndeedException(Exception):
-    """Failed to scrape Indeed"""
+    def __init__(self, message=None):
+        super().__init__(message or "An error occurred with Indeed")


 class ZipRecruiterException(Exception):
-    """Failed to scrape ZipRecruiter"""
+    def __init__(self, message=None):
+        super().__init__(message or "An error occurred with ZipRecruiter")
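The new constructors fall back to a generic message when none is supplied, so both bare and message-carrying raises stay informative. A quick sketch; the jobspy.scrapers.exceptions import path is inferred from the relative imports in this diff:

from jobspy.scrapers.exceptions import IndeedException  # import path assumed

try:
    raise IndeedException()
except IndeedException as e:
    print(e)  # "An error occurred with Indeed"

try:
    raise IndeedException("blocked with status 403")
except IndeedException as e:
    print(e)  # the explicit message is kept as-is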
@@ -16,7 +16,12 @@ from bs4.element import Tag
 from concurrent.futures import ThreadPoolExecutor, Future

 from ..exceptions import IndeedException
-from ..utils import count_urgent_words, extract_emails_from_text, create_session
+from ..utils import (
+    count_urgent_words,
+    extract_emails_from_text,
+    create_session,
+    get_enum_from_job_type,
+)
 from ...jobs import (
     JobPost,
     Compensation,
@@ -162,10 +167,10 @@ class IndeedScraper(Scraper):
             )
             return job_post

+        jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
         with ThreadPoolExecutor(max_workers=1) as executor:
             job_results: list[Future] = [
-                executor.submit(process_job, job)
-                for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
+                executor.submit(process_job, job) for job in jobs
             ]

         job_list = [result.result() for result in job_results if result.result()]
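The refactor hoists the nested dict lookup out of the comprehension; the submit-and-collect pattern itself is unchanged. A self-contained sketch of that pattern, where process_job and the jobkey field are hypothetical stand-ins for the scraper's real parsing:

from concurrent.futures import ThreadPoolExecutor, Future

def process_job(job: dict) -> dict | None:
    # Stand-in for the scraper's per-job parsing; drop jobs without a key.
    return job if job.get("jobkey") else None

jobs = [{"jobkey": "abc123"}, {"jobkey": ""}]

with ThreadPoolExecutor(max_workers=1) as executor:
    job_results: list[Future] = [executor.submit(process_job, job) for job in jobs]

# Keep only truthy results, mirroring the filter in the hunk above.
job_list = [result.result() for result in job_results if result.result()]
print(job_list)  # [{'jobkey': 'abc123'}]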
@@ -230,12 +235,36 @@ class IndeedScraper(Scraper):
         if response.status_code not in range(200, 400):
             return None

-        raw_description = response.json()["body"]["jobInfoWrapperModel"][
-            "jobInfoModel"
-        ]["sanitizedJobDescription"]
-        with io.StringIO(raw_description) as f:
-            soup = BeautifulSoup(f, "html.parser")
-            text_content = " ".join(soup.get_text().split()).strip()
+        soup = BeautifulSoup(response.text, "html.parser")
+        script_tag = soup.find(
+            "script", text=lambda x: x and "window._initialData" in x
+        )
+
+        if not script_tag:
+            return None
+
+        script_code = script_tag.string
+        match = re.search(r"window\._initialData\s*=\s*({.*?})\s*;", script_code, re.S)
+
+        if not match:
+            return None
+
+        json_string = match.group(1)
+        data = json.loads(json_string)
+        try:
+            job_description = data["jobInfoWrapperModel"]["jobInfoModel"][
+                "sanitizedJobDescription"
+            ]
+        except (KeyError, TypeError, IndexError):
+            return None
+
+        soup = BeautifulSoup(
+            job_description, "html.parser"
+        )
+        text_content = " ".join(
+            soup.get_text(separator=" ").split()
+        ).strip()

         return text_content

     @staticmethod
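The new description path parses the page HTML, locates the script tag that assigns window._initialData, extracts the JSON object with a regex, and reads the sanitized description from it. A self-contained sketch of the same technique; the sample HTML and its field values are made up:

import json
import re
from bs4 import BeautifulSoup

# Made-up page with an inline window._initialData assignment.
html = """
<html><body>
<script>window._initialData={"jobInfoWrapperModel":{"jobInfoModel":
{"sanitizedJobDescription":"<p>Build &amp; ship features</p>"}}};</script>
</body></html>
"""

soup = BeautifulSoup(html, "html.parser")
script_tag = soup.find("script", string=lambda x: x and "window._initialData" in x)
match = re.search(r"window\._initialData\s*=\s*({.*?})\s*;", script_tag.string, re.S)
data = json.loads(match.group(1))
raw = data["jobInfoWrapperModel"]["jobInfoModel"]["sanitizedJobDescription"]
text = " ".join(BeautifulSoup(raw, "html.parser").get_text(separator=" ").split()).strip()
print(text)  # Build & ship features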
@@ -252,22 +281,11 @@ class IndeedScraper(Scraper):
                 label = taxonomy["attributes"][i].get("label")
                 if label:
                     job_type_str = label.replace("-", "").replace(" ", "").lower()
-                    job_types.append(
-                        IndeedScraper.get_enum_from_job_type(job_type_str)
-                    )
+                    job_type = get_enum_from_job_type(job_type_str)
+                    if job_type:
+                        job_types.append(job_type)
         return job_types

-    @staticmethod
-    def get_enum_from_job_type(job_type_str):
-        """
-        Given a string, returns the corresponding JobType enum member if a match is found.
-        for job_type in JobType:
-        """
-        for job_type in JobType:
-            if job_type_str in job_type.value:
-                return job_type
-        return None
-
     @staticmethod
     def parse_jobs(soup: BeautifulSoup) -> dict:
         """
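The rewritten loop only appends when the lookup succeeds, whereas the old code could push None into job_types for labels with no matching alias. A toy illustration of the difference; the lookup table here is hypothetical:

# Hypothetical lookup standing in for get_enum_from_job_type.
lookup = {"fulltime": "FULL_TIME", "parttime": "PART_TIME"}.get

labels = ["fulltime", "volunteer"]

# Old behaviour: unconditional append, so unmatched labels become None entries.
old_job_types = [lookup(label) for label in labels]
print(old_job_types)  # ['FULL_TIME', None]

# New behaviour: append only when a match was found.
new_job_types = []
for label in labels:
    job_type = lookup(label)
    if job_type:
        new_job_types.append(job_type)
print(new_job_types)  # ['FULL_TIME']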
@@ -9,7 +9,6 @@ from datetime import datetime

 import requests
 import time
-import re
 from requests.exceptions import ProxyError
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from bs4 import BeautifulSoup
@@ -17,7 +16,7 @@ from bs4.element import Tag
 from threading import Lock

 from .. import Scraper, ScraperInput, Site
-from ..utils import count_urgent_words, extract_emails_from_text
+from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type
 from ..exceptions import LinkedInException
 from ...jobs import (
     JobPost,
@@ -237,17 +236,10 @@ class LinkedInScraper(Scraper):
             employment_type = employment_type.lower()
             employment_type = employment_type.replace("-", "")

-            return LinkedInScraper.get_enum_from_value(employment_type)
+            return [get_enum_from_job_type(employment_type)]

         return description, get_job_type(soup)

-    @staticmethod
-    def get_enum_from_value(value_str):
-        for job_type in JobType:
-            if value_str in job_type.value:
-                return [job_type]
-        return None
-
     def get_location(self, metadata_card: Optional[Tag]) -> Location:
         """
         Extracts the location data from the job metadata card.
@@ -1,5 +1,6 @@
 import re
 import tls_client
+from ..jobs import JobType


 def count_urgent_words(description: str) -> int:
@@ -42,3 +43,14 @@ def create_session(proxy: str | None = None):
     # }

     return session
+
+
+def get_enum_from_job_type(job_type_str: str) -> JobType | None:
+    """
+    Given a string, returns the corresponding JobType enum member if a match is found.
+    """
+    res = None
+    for job_type in JobType:
+        if job_type_str in job_type.value:
+            res = job_type
+    return res
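The shared helper returns the matching member, or None when no alias matches. A hypothetical usage sketch; the jobspy.scrapers.utils import path is inferred from the relative imports above, and the inputs are pre-normalized the way the scrapers do:

from jobspy.scrapers.utils import get_enum_from_job_type  # import path assumed

print(get_enum_from_job_type("parttime"))   # JobType.PART_TIME
print(get_enum_from_job_type("praktik"))    # JobType.INTERNSHIP
print(get_enum_from_job_type("volunteer"))  # None, no matching alias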