[fix] indeed job types

indeed description fetching
pull/58/head
Cullen Watson 2023-10-18 14:22:45 -05:00
parent 0163fb1fdf
commit bb21f47537
8 changed files with 59 additions and 48 deletions

View File

@ -92,9 +92,9 @@ JobPost
│ ├── city (str)
│ ├── state (str)
├── description (str)
├── job_type (enum): fulltime, parttime, internship, contract
├── job_type (str): fulltime, parttime, internship, contract
├── compensation (object)
│ ├── interval (enum): yearly, monthly, weekly, daily, hourly
│ ├── interval (str): yearly, monthly, weekly, daily, hourly
│ ├── min_amount (int)
│ ├── max_amount (int)
│ └── currency (enum)

View File

@ -2,8 +2,8 @@
name = "python-jobspy"
version = "1.1.14"
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"]
homepage = "https://github.com/cullenwatson/JobSpy"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy"
readme = "README.md"
packages = [
@ -16,6 +16,7 @@ requests = "^2.31.0"
tls-client = "^0.2.1"
beautifulsoup4 = "^4.12.2"
pandas = "^2.1.0"
NUMPY = "1.24.2"
pydantic = "^2.3.0"

View File

@ -84,13 +84,12 @@ def scrape_jobs(
except (LinkedInException, IndeedException, ZipRecruiterException) as lie:
raise lie
except Exception as e:
# unhandled exceptions
if site == Site.LINKEDIN:
raise LinkedInException()
raise LinkedInException(str(e))
if site == Site.INDEED:
raise IndeedException()
raise IndeedException(str(e))
if site == Site.ZIP_RECRUITER:
raise ZipRecruiterException()
raise ZipRecruiterException(str(e))
else:
raise e
return site.value, scraped_data

View File

@ -40,12 +40,12 @@ class JobType(Enum):
PART_TIME = (
"parttime",
"teilzeit",
"Částečnýúvazek",
"Deltid"
"částečnýúvazek",
"deltid"
)
CONTRACT = ("contract", "contractor")
TEMPORARY = ("temporary",)
INTERNSHIP = ("internship", "prácticas", "ojt(onthejobtraining)", "praktikum", "Praktik")
INTERNSHIP = ("internship", "prácticas", "ojt(onthejobtraining)", "praktikum", "praktik")
PER_DIEM = ("perdiem",)
NIGHTS = ("nights",)

View File

@ -7,12 +7,15 @@ This module contains the set of Scrapers' exceptions.
class LinkedInException(Exception):
    """Raised when scraping LinkedIn fails."""

    def __init__(self, message=None):
        # Any falsy message (None, empty string) is replaced by the generic text.
        if not message:
            message = "An error occurred with LinkedIn"
        super().__init__(message)
class IndeedException(Exception):
    """Raised when scraping Indeed fails."""

    def __init__(self, message=None):
        # Any falsy message (None, empty string) is replaced by the generic text.
        if not message:
            message = "An error occurred with Indeed"
        super().__init__(message)
class ZipRecruiterException(Exception):
    """Raised when scraping ZipRecruiter fails."""

    def __init__(self, message=None):
        # Any falsy message (None, empty string) is replaced by the generic text.
        if not message:
            message = "An error occurred with ZipRecruiter"
        super().__init__(message)

View File

@ -16,7 +16,7 @@ from bs4.element import Tag
from concurrent.futures import ThreadPoolExecutor, Future
from ..exceptions import IndeedException
from ..utils import count_urgent_words, extract_emails_from_text, create_session
from ..utils import count_urgent_words, extract_emails_from_text, create_session, get_enum_from_job_type
from ...jobs import (
JobPost,
Compensation,
@ -162,10 +162,11 @@ class IndeedScraper(Scraper):
)
return job_post
jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
with ThreadPoolExecutor(max_workers=1) as executor:
job_results: list[Future] = [
executor.submit(process_job, job)
for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
for job in jobs
]
job_list = [result.result() for result in job_results if result.result()]
@ -230,13 +231,29 @@ class IndeedScraper(Scraper):
if response.status_code not in range(200, 400):
return None
raw_description = response.json()["body"]["jobInfoWrapperModel"][
"jobInfoModel"
]["sanitizedJobDescription"]
with io.StringIO(raw_description) as f:
soup = BeautifulSoup(f, "html.parser")
text_content = " ".join(soup.get_text().split()).strip()
return text_content
soup = BeautifulSoup(response.text, 'html.parser')
script_tag = soup.find('script', text=lambda x: x and 'window._initialData' in x)
if not script_tag:
return None
script_code = script_tag.string
match = re.search(r'window\._initialData\s*=\s*({.*?})\s*;', script_code, re.S)
if not match:
return None
json_string = match.group(1)
data = json.loads(json_string)
try:
job_description = data["jobInfoWrapperModel"]["jobInfoModel"]["sanitizedJobDescription"]
except (KeyError, TypeError, IndexError):
return None
soup = BeautifulSoup(job_description, "html.parser") # No need for StringIO, pass the string directly
text_content = " ".join(soup.get_text(separator=" ").split()).strip() # Clean and normalize whitespaces
return text_content
@staticmethod
def get_job_type(job: dict) -> list[JobType] | None:
@ -252,22 +269,11 @@ class IndeedScraper(Scraper):
label = taxonomy["attributes"][i].get("label")
if label:
job_type_str = label.replace("-", "").replace(" ", "").lower()
job_types.append(
IndeedScraper.get_enum_from_job_type(job_type_str)
)
job_type = get_enum_from_job_type(job_type_str)
if job_type:
job_types.append(job_type)
return job_types
@staticmethod
def get_enum_from_job_type(job_type_str):
"""
Given a string, returns the corresponding JobType enum member if a match is found.
for job_type in JobType:
"""
for job_type in JobType:
if job_type_str in job_type.value:
return job_type
return None
@staticmethod
def parse_jobs(soup: BeautifulSoup) -> dict:
"""
@ -291,7 +297,6 @@ class IndeedScraper(Scraper):
):
return tag
return None
script_tag = find_mosaic_script()
if script_tag:

View File

@ -9,7 +9,6 @@ from datetime import datetime
import requests
import time
import re
from requests.exceptions import ProxyError
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
@ -17,7 +16,7 @@ from bs4.element import Tag
from threading import Lock
from .. import Scraper, ScraperInput, Site
from ..utils import count_urgent_words, extract_emails_from_text
from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type
from ..exceptions import LinkedInException
from ...jobs import (
JobPost,
@ -237,17 +236,10 @@ class LinkedInScraper(Scraper):
employment_type = employment_type.lower()
employment_type = employment_type.replace("-", "")
return LinkedInScraper.get_enum_from_value(employment_type)
return [get_enum_from_job_type(employment_type)]
return description, get_job_type(soup)
@staticmethod
def get_enum_from_value(value_str):
for job_type in JobType:
if value_str in job_type.value:
return [job_type]
return None
def get_location(self, metadata_card: Optional[Tag]) -> Location:
"""
Extracts the location data from the job metadata card.

View File

@ -1,5 +1,6 @@
import re
import tls_client
from ..jobs import JobType
def count_urgent_words(description: str) -> int:
@ -42,3 +43,13 @@ def create_session(proxy: str | None = None):
# }
return session
def get_enum_from_job_type(job_type_str: str) -> JobType | None:
    """
    Look up the JobType member matching the given string.

    Each member is tested with ``job_type_str in member.value``. Every
    member is examined; when more than one matches, the member that comes
    last in JobType's iteration order is returned. Returns None when no
    member matches.
    """
    matches = [member for member in JobType if job_type_str in member.value]
    return matches[-1] if matches else None