mirror of https://github.com/Bunsly/JobSpy
parent
5e71866630
commit
f2cc74b7f2
|
@ -92,9 +92,9 @@ JobPost
|
||||||
│ ├── city (str)
|
│ ├── city (str)
|
||||||
│ ├── state (str)
|
│ ├── state (str)
|
||||||
├── description (str)
|
├── description (str)
|
||||||
├── job_type (enum): fulltime, parttime, internship, contract
|
├── job_type (str): fulltime, parttime, internship, contract
|
||||||
├── compensation (object)
|
├── compensation (object)
|
||||||
│ ├── interval (enum): yearly, monthly, weekly, daily, hourly
|
│ ├── interval (str): yearly, monthly, weekly, daily, hourly
|
||||||
│ ├── min_amount (int)
|
│ ├── min_amount (int)
|
||||||
│ ├── max_amount (int)
|
│ ├── max_amount (int)
|
||||||
│ └── currency (enum)
|
│ └── currency (enum)
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "python-jobspy"
|
name = "python-jobspy"
|
||||||
version = "1.1.13"
|
version = "1.1.14"
|
||||||
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
|
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
|
||||||
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
|
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
||||||
homepage = "https://github.com/cullenwatson/JobSpy"
|
homepage = "https://github.com/Bunsly/JobSpy"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
|
|
||||||
packages = [
|
packages = [
|
||||||
|
@ -16,6 +16,7 @@ requests = "^2.31.0"
|
||||||
tls-client = "^0.2.1"
|
tls-client = "^0.2.1"
|
||||||
beautifulsoup4 = "^4.12.2"
|
beautifulsoup4 = "^4.12.2"
|
||||||
pandas = "^2.1.0"
|
pandas = "^2.1.0"
|
||||||
|
NUMPY = "1.24.2"
|
||||||
pydantic = "^2.3.0"
|
pydantic = "^2.3.0"
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -84,13 +84,12 @@ def scrape_jobs(
|
||||||
except (LinkedInException, IndeedException, ZipRecruiterException) as lie:
|
except (LinkedInException, IndeedException, ZipRecruiterException) as lie:
|
||||||
raise lie
|
raise lie
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# unhandled exceptions
|
|
||||||
if site == Site.LINKEDIN:
|
if site == Site.LINKEDIN:
|
||||||
raise LinkedInException()
|
raise LinkedInException(str(e))
|
||||||
if site == Site.INDEED:
|
if site == Site.INDEED:
|
||||||
raise IndeedException()
|
raise IndeedException(str(e))
|
||||||
if site == Site.ZIP_RECRUITER:
|
if site == Site.ZIP_RECRUITER:
|
||||||
raise ZipRecruiterException()
|
raise ZipRecruiterException(str(e))
|
||||||
else:
|
else:
|
||||||
raise e
|
raise e
|
||||||
return site.value, scraped_data
|
return site.value, scraped_data
|
||||||
|
|
|
@ -37,10 +37,16 @@ class JobType(Enum):
|
||||||
"повназайнятість",
|
"повназайнятість",
|
||||||
"toànthờigian",
|
"toànthờigian",
|
||||||
)
|
)
|
||||||
PART_TIME = ("parttime", "teilzeit")
|
PART_TIME = ("parttime", "teilzeit", "částečnýúvazek", "deltid")
|
||||||
CONTRACT = ("contract", "contractor")
|
CONTRACT = ("contract", "contractor")
|
||||||
TEMPORARY = ("temporary",)
|
TEMPORARY = ("temporary",)
|
||||||
INTERNSHIP = ("internship", "prácticas", "ojt(onthejobtraining)", "praktikum")
|
INTERNSHIP = (
|
||||||
|
"internship",
|
||||||
|
"prácticas",
|
||||||
|
"ojt(onthejobtraining)",
|
||||||
|
"praktikum",
|
||||||
|
"praktik",
|
||||||
|
)
|
||||||
|
|
||||||
PER_DIEM = ("perdiem",)
|
PER_DIEM = ("perdiem",)
|
||||||
NIGHTS = ("nights",)
|
NIGHTS = ("nights",)
|
||||||
|
|
|
@ -7,12 +7,15 @@ This module contains the set of Scrapers' exceptions.
|
||||||
|
|
||||||
|
|
||||||
class LinkedInException(Exception):
|
class LinkedInException(Exception):
|
||||||
"""Failed to scrape LinkedIn"""
|
def __init__(self, message=None):
|
||||||
|
super().__init__(message or "An error occurred with LinkedIn")
|
||||||
|
|
||||||
|
|
||||||
class IndeedException(Exception):
|
class IndeedException(Exception):
|
||||||
"""Failed to scrape Indeed"""
|
def __init__(self, message=None):
|
||||||
|
super().__init__(message or "An error occurred with Indeed")
|
||||||
|
|
||||||
|
|
||||||
class ZipRecruiterException(Exception):
|
class ZipRecruiterException(Exception):
|
||||||
"""Failed to scrape ZipRecruiter"""
|
def __init__(self, message=None):
|
||||||
|
super().__init__(message or "An error occurred with ZipRecruiter")
|
||||||
|
|
|
@ -16,7 +16,12 @@ from bs4.element import Tag
|
||||||
from concurrent.futures import ThreadPoolExecutor, Future
|
from concurrent.futures import ThreadPoolExecutor, Future
|
||||||
|
|
||||||
from ..exceptions import IndeedException
|
from ..exceptions import IndeedException
|
||||||
from ..utils import count_urgent_words, extract_emails_from_text, create_session
|
from ..utils import (
|
||||||
|
count_urgent_words,
|
||||||
|
extract_emails_from_text,
|
||||||
|
create_session,
|
||||||
|
get_enum_from_job_type,
|
||||||
|
)
|
||||||
from ...jobs import (
|
from ...jobs import (
|
||||||
JobPost,
|
JobPost,
|
||||||
Compensation,
|
Compensation,
|
||||||
|
@ -162,10 +167,10 @@ class IndeedScraper(Scraper):
|
||||||
)
|
)
|
||||||
return job_post
|
return job_post
|
||||||
|
|
||||||
|
jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
|
||||||
with ThreadPoolExecutor(max_workers=1) as executor:
|
with ThreadPoolExecutor(max_workers=1) as executor:
|
||||||
job_results: list[Future] = [
|
job_results: list[Future] = [
|
||||||
executor.submit(process_job, job)
|
executor.submit(process_job, job) for job in jobs
|
||||||
for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
|
|
||||||
]
|
]
|
||||||
|
|
||||||
job_list = [result.result() for result in job_results if result.result()]
|
job_list = [result.result() for result in job_results if result.result()]
|
||||||
|
@ -230,13 +235,37 @@ class IndeedScraper(Scraper):
|
||||||
if response.status_code not in range(200, 400):
|
if response.status_code not in range(200, 400):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
raw_description = response.json()["body"]["jobInfoWrapperModel"][
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
"jobInfoModel"
|
script_tag = soup.find(
|
||||||
]["sanitizedJobDescription"]
|
"script", text=lambda x: x and "window._initialData" in x
|
||||||
with io.StringIO(raw_description) as f:
|
)
|
||||||
soup = BeautifulSoup(f, "html.parser")
|
|
||||||
text_content = " ".join(soup.get_text().split()).strip()
|
if not script_tag:
|
||||||
return text_content
|
return None
|
||||||
|
|
||||||
|
script_code = script_tag.string
|
||||||
|
match = re.search(r"window\._initialData\s*=\s*({.*?})\s*;", script_code, re.S)
|
||||||
|
|
||||||
|
if not match:
|
||||||
|
return None
|
||||||
|
|
||||||
|
json_string = match.group(1)
|
||||||
|
data = json.loads(json_string)
|
||||||
|
try:
|
||||||
|
job_description = data["jobInfoWrapperModel"]["jobInfoModel"][
|
||||||
|
"sanitizedJobDescription"
|
||||||
|
]
|
||||||
|
except (KeyError, TypeError, IndexError):
|
||||||
|
return None
|
||||||
|
|
||||||
|
soup = BeautifulSoup(
|
||||||
|
job_description, "html.parser"
|
||||||
|
)
|
||||||
|
text_content = " ".join(
|
||||||
|
soup.get_text(separator=" ").split()
|
||||||
|
).strip()
|
||||||
|
|
||||||
|
return text_content
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_job_type(job: dict) -> list[JobType] | None:
|
def get_job_type(job: dict) -> list[JobType] | None:
|
||||||
|
@ -252,22 +281,11 @@ class IndeedScraper(Scraper):
|
||||||
label = taxonomy["attributes"][i].get("label")
|
label = taxonomy["attributes"][i].get("label")
|
||||||
if label:
|
if label:
|
||||||
job_type_str = label.replace("-", "").replace(" ", "").lower()
|
job_type_str = label.replace("-", "").replace(" ", "").lower()
|
||||||
job_types.append(
|
job_type = get_enum_from_job_type(job_type_str)
|
||||||
IndeedScraper.get_enum_from_job_type(job_type_str)
|
if job_type:
|
||||||
)
|
job_types.append(job_type)
|
||||||
return job_types
|
return job_types
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_enum_from_job_type(job_type_str):
|
|
||||||
"""
|
|
||||||
Given a string, returns the corresponding JobType enum member if a match is found.
|
|
||||||
for job_type in JobType:
|
|
||||||
"""
|
|
||||||
for job_type in JobType:
|
|
||||||
if job_type_str in job_type.value:
|
|
||||||
return job_type
|
|
||||||
return None
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def parse_jobs(soup: BeautifulSoup) -> dict:
|
def parse_jobs(soup: BeautifulSoup) -> dict:
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -9,7 +9,6 @@ from datetime import datetime
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
import time
|
import time
|
||||||
import re
|
|
||||||
from requests.exceptions import ProxyError
|
from requests.exceptions import ProxyError
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
@ -17,7 +16,7 @@ from bs4.element import Tag
|
||||||
from threading import Lock
|
from threading import Lock
|
||||||
|
|
||||||
from .. import Scraper, ScraperInput, Site
|
from .. import Scraper, ScraperInput, Site
|
||||||
from ..utils import count_urgent_words, extract_emails_from_text
|
from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type
|
||||||
from ..exceptions import LinkedInException
|
from ..exceptions import LinkedInException
|
||||||
from ...jobs import (
|
from ...jobs import (
|
||||||
JobPost,
|
JobPost,
|
||||||
|
@ -237,17 +236,10 @@ class LinkedInScraper(Scraper):
|
||||||
employment_type = employment_type.lower()
|
employment_type = employment_type.lower()
|
||||||
employment_type = employment_type.replace("-", "")
|
employment_type = employment_type.replace("-", "")
|
||||||
|
|
||||||
return LinkedInScraper.get_enum_from_value(employment_type)
|
return [get_enum_from_job_type(employment_type)]
|
||||||
|
|
||||||
return description, get_job_type(soup)
|
return description, get_job_type(soup)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_enum_from_value(value_str):
|
|
||||||
for job_type in JobType:
|
|
||||||
if value_str in job_type.value:
|
|
||||||
return [job_type]
|
|
||||||
return None
|
|
||||||
|
|
||||||
def get_location(self, metadata_card: Optional[Tag]) -> Location:
|
def get_location(self, metadata_card: Optional[Tag]) -> Location:
|
||||||
"""
|
"""
|
||||||
Extracts the location data from the job metadata card.
|
Extracts the location data from the job metadata card.
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
import re
|
import re
|
||||||
import tls_client
|
import tls_client
|
||||||
|
from ..jobs import JobType
|
||||||
|
|
||||||
|
|
||||||
def count_urgent_words(description: str) -> int:
|
def count_urgent_words(description: str) -> int:
|
||||||
|
@ -42,3 +43,14 @@ def create_session(proxy: str | None = None):
|
||||||
# }
|
# }
|
||||||
|
|
||||||
return session
|
return session
|
||||||
|
|
||||||
|
|
||||||
|
def get_enum_from_job_type(job_type_str: str) -> JobType | None:
|
||||||
|
"""
|
||||||
|
Given a string, returns the corresponding JobType enum member if a match is found.
|
||||||
|
"""
|
||||||
|
res = None
|
||||||
|
for job_type in JobType:
|
||||||
|
if job_type_str in job_type.value:
|
||||||
|
res = job_type
|
||||||
|
return res
|
||||||
|
|
Loading…
Reference in New Issue