[fix] indeed job types

indeed description fetching
pull/58/head
Cullen Watson 2023-10-18 14:22:45 -05:00
parent 0163fb1fdf
commit bb21f47537
8 changed files with 59 additions and 48 deletions

View File

@ -92,9 +92,9 @@ JobPost
│ ├── city (str) │ ├── city (str)
│ ├── state (str) │ ├── state (str)
├── description (str) ├── description (str)
├── job_type (enum): fulltime, parttime, internship, contract ├── job_type (str): fulltime, parttime, internship, contract
├── compensation (object) ├── compensation (object)
│ ├── interval (enum): yearly, monthly, weekly, daily, hourly │ ├── interval (str): yearly, monthly, weekly, daily, hourly
│ ├── min_amount (int) │ ├── min_amount (int)
│ ├── max_amount (int) │ ├── max_amount (int)
│ └── currency (enum) │ └── currency (enum)

View File

@ -2,8 +2,8 @@
name = "python-jobspy" name = "python-jobspy"
version = "1.1.14" version = "1.1.14"
description = "Job scraper for LinkedIn, Indeed & ZipRecruiter" description = "Job scraper for LinkedIn, Indeed & ZipRecruiter"
authors = ["Zachary Hampton <zachary@zacharysproducts.com>", "Cullen Watson <cullen@cullen.ai>"] authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/cullenwatson/JobSpy" homepage = "https://github.com/Bunsly/JobSpy"
readme = "README.md" readme = "README.md"
packages = [ packages = [
@ -16,6 +16,7 @@ requests = "^2.31.0"
tls-client = "^0.2.1" tls-client = "^0.2.1"
beautifulsoup4 = "^4.12.2" beautifulsoup4 = "^4.12.2"
pandas = "^2.1.0" pandas = "^2.1.0"
NUMPY = "1.24.2"
pydantic = "^2.3.0" pydantic = "^2.3.0"

View File

@ -84,13 +84,12 @@ def scrape_jobs(
except (LinkedInException, IndeedException, ZipRecruiterException) as lie: except (LinkedInException, IndeedException, ZipRecruiterException) as lie:
raise lie raise lie
except Exception as e: except Exception as e:
# unhandled exceptions
if site == Site.LINKEDIN: if site == Site.LINKEDIN:
raise LinkedInException() raise LinkedInException(str(e))
if site == Site.INDEED: if site == Site.INDEED:
raise IndeedException() raise IndeedException(str(e))
if site == Site.ZIP_RECRUITER: if site == Site.ZIP_RECRUITER:
raise ZipRecruiterException() raise ZipRecruiterException(str(e))
else: else:
raise e raise e
return site.value, scraped_data return site.value, scraped_data

View File

@ -40,12 +40,12 @@ class JobType(Enum):
PART_TIME = ( PART_TIME = (
"parttime", "parttime",
"teilzeit", "teilzeit",
"Částečnýúvazek", "částečnýúvazek",
"Deltid" "deltid"
) )
CONTRACT = ("contract", "contractor") CONTRACT = ("contract", "contractor")
TEMPORARY = ("temporary",) TEMPORARY = ("temporary",)
INTERNSHIP = ("internship", "prácticas", "ojt(onthejobtraining)", "praktikum", "Praktik") INTERNSHIP = ("internship", "prácticas", "ojt(onthejobtraining)", "praktikum", "praktik")
PER_DIEM = ("perdiem",) PER_DIEM = ("perdiem",)
NIGHTS = ("nights",) NIGHTS = ("nights",)

View File

@ -7,12 +7,15 @@ This module contains the set of Scrapers' exceptions.
class LinkedInException(Exception): class LinkedInException(Exception):
"""Failed to scrape LinkedIn""" def __init__(self, message=None):
super().__init__(message or "An error occurred with LinkedIn")
class IndeedException(Exception): class IndeedException(Exception):
"""Failed to scrape Indeed""" def __init__(self, message=None):
super().__init__(message or "An error occurred with Indeed")
class ZipRecruiterException(Exception): class ZipRecruiterException(Exception):
"""Failed to scrape ZipRecruiter""" def __init__(self, message=None):
super().__init__(message or "An error occurred with ZipRecruiter")

View File

@ -16,7 +16,7 @@ from bs4.element import Tag
from concurrent.futures import ThreadPoolExecutor, Future from concurrent.futures import ThreadPoolExecutor, Future
from ..exceptions import IndeedException from ..exceptions import IndeedException
from ..utils import count_urgent_words, extract_emails_from_text, create_session from ..utils import count_urgent_words, extract_emails_from_text, create_session, get_enum_from_job_type
from ...jobs import ( from ...jobs import (
JobPost, JobPost,
Compensation, Compensation,
@ -162,10 +162,11 @@ class IndeedScraper(Scraper):
) )
return job_post return job_post
jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
with ThreadPoolExecutor(max_workers=1) as executor: with ThreadPoolExecutor(max_workers=1) as executor:
job_results: list[Future] = [ job_results: list[Future] = [
executor.submit(process_job, job) executor.submit(process_job, job)
for job in jobs["metaData"]["mosaicProviderJobCardsModel"]["results"] for job in jobs
] ]
job_list = [result.result() for result in job_results if result.result()] job_list = [result.result() for result in job_results if result.result()]
@ -230,12 +231,28 @@ class IndeedScraper(Scraper):
if response.status_code not in range(200, 400): if response.status_code not in range(200, 400):
return None return None
raw_description = response.json()["body"]["jobInfoWrapperModel"][ soup = BeautifulSoup(response.text, 'html.parser')
"jobInfoModel" script_tag = soup.find('script', text=lambda x: x and 'window._initialData' in x)
]["sanitizedJobDescription"]
with io.StringIO(raw_description) as f: if not script_tag:
soup = BeautifulSoup(f, "html.parser") return None
text_content = " ".join(soup.get_text().split()).strip()
script_code = script_tag.string
match = re.search(r'window\._initialData\s*=\s*({.*?})\s*;', script_code, re.S)
if not match:
return None
json_string = match.group(1)
data = json.loads(json_string)
try:
job_description = data["jobInfoWrapperModel"]["jobInfoModel"]["sanitizedJobDescription"]
except (KeyError, TypeError, IndexError):
return None
soup = BeautifulSoup(job_description, "html.parser") # No need for StringIO, pass the string directly
text_content = " ".join(soup.get_text(separator=" ").split()).strip() # Clean and normalize whitespaces
return text_content return text_content
@staticmethod @staticmethod
@ -252,22 +269,11 @@ class IndeedScraper(Scraper):
label = taxonomy["attributes"][i].get("label") label = taxonomy["attributes"][i].get("label")
if label: if label:
job_type_str = label.replace("-", "").replace(" ", "").lower() job_type_str = label.replace("-", "").replace(" ", "").lower()
job_types.append( job_type = get_enum_from_job_type(job_type_str)
IndeedScraper.get_enum_from_job_type(job_type_str) if job_type:
) job_types.append(job_type)
return job_types return job_types
@staticmethod
def get_enum_from_job_type(job_type_str):
"""
Given a string, returns the corresponding JobType enum member if a match is found.
for job_type in JobType:
"""
for job_type in JobType:
if job_type_str in job_type.value:
return job_type
return None
@staticmethod @staticmethod
def parse_jobs(soup: BeautifulSoup) -> dict: def parse_jobs(soup: BeautifulSoup) -> dict:
""" """
@ -291,7 +297,6 @@ class IndeedScraper(Scraper):
): ):
return tag return tag
return None return None
script_tag = find_mosaic_script() script_tag = find_mosaic_script()
if script_tag: if script_tag:

View File

@ -9,7 +9,6 @@ from datetime import datetime
import requests import requests
import time import time
import re
from requests.exceptions import ProxyError from requests.exceptions import ProxyError
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -17,7 +16,7 @@ from bs4.element import Tag
from threading import Lock from threading import Lock
from .. import Scraper, ScraperInput, Site from .. import Scraper, ScraperInput, Site
from ..utils import count_urgent_words, extract_emails_from_text from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type
from ..exceptions import LinkedInException from ..exceptions import LinkedInException
from ...jobs import ( from ...jobs import (
JobPost, JobPost,
@ -237,17 +236,10 @@ class LinkedInScraper(Scraper):
employment_type = employment_type.lower() employment_type = employment_type.lower()
employment_type = employment_type.replace("-", "") employment_type = employment_type.replace("-", "")
return LinkedInScraper.get_enum_from_value(employment_type) return [get_enum_from_job_type(employment_type)]
return description, get_job_type(soup) return description, get_job_type(soup)
@staticmethod
def get_enum_from_value(value_str):
for job_type in JobType:
if value_str in job_type.value:
return [job_type]
return None
def get_location(self, metadata_card: Optional[Tag]) -> Location: def get_location(self, metadata_card: Optional[Tag]) -> Location:
""" """
Extracts the location data from the job metadata card. Extracts the location data from the job metadata card.

View File

@ -1,5 +1,6 @@
import re import re
import tls_client import tls_client
from ..jobs import JobType
def count_urgent_words(description: str) -> int: def count_urgent_words(description: str) -> int:
@ -42,3 +43,13 @@ def create_session(proxy: str | None = None):
# } # }
return session return session
def get_enum_from_job_type(job_type_str: str) -> JobType | None:
    """
    Look up the JobType member whose value contains the given string.

    Every member is checked; when more than one member's value contains
    job_type_str, the last one encountered during iteration is returned.
    Returns None when no member matches.
    """
    # Collect all hits rather than stopping early: the final hit wins.
    matches = [member for member in JobType if job_type_str in member.value]
    return matches[-1] if matches else None