# mirror of https://github.com/Bunsly/JobSpy
"""
|
|
jobspy.scrapers.Goozali
|
|
~~~~~~~~~~~~~~~~~~~
|
|
|
|
This module contains routines to scrape Goozali.
|
|
"""

from __future__ import annotations

import math
import time
import random
import regex as re
from typing import Optional
from datetime import datetime

from bs4.element import Tag
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urlunparse, unquote
from requests.exceptions import RetryError, RequestException
from urllib3.exceptions import MaxRetryError

from .constants import headers
from .. import Scraper, ScraperInput, Site
from ..exceptions import GoozaliException
from ..utils import (
    create_session,
    remove_attributes,
    create_logger,
    extract_emails_from_text,
    get_enum_from_job_type,
    currency_parser,
    markdown_converter,
)
from ...jobs import (
    JobPost,
    Location,
    JobResponse,
    JobType,
    Country,
    Compensation,
    DescriptionFormat,
)

logger = create_logger("Goozali")


class GoozaliScraper(Scraper):
    base_url = "https://www.Goozali.com"
    delay = 3
    band_delay = 4
    jobs_per_page = 25

    def __init__(
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
    ):
        """
        Initializes GoozaliScraper with the Goozali job search url
        """
        super().__init__(Site.GOOZALI, proxies=proxies, ca_cert=ca_cert)
        self.session = create_session(
            proxies=self.proxies,
            ca_cert=ca_cert,
            is_tls=False,
            has_retry=True,
            delay=5,
            clear_cookies=True,
        )
        self.session.headers.update(headers)
        self.scraper_input = None
        self.country = "worldwide"
        self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+')

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes Goozali for jobs with scraper_input criteria
        :param scraper_input:
        :return: job_response
        """
        self.scraper_input = scraper_input
        job_list: list[JobPost] = []
        seen_ids = set()
        # The fetch/filter pipeline is not implemented yet. The intended steps
        # (sketched below) are:
        # 1. build the request URL
        # 2. run the API call through the session
        # 3. model the response with the Goozali models
        # 4. map each columnId to its Column object
        # 5. filter the results by field, as the web UI does
        # 6. filter by date
        # 7. map the results to a JobResponse object
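        # A hedged sketch of how those steps might wire together. It is kept
        # in comments because the endpoint and the GoozaliRequest /
        # GoozaliResponse / Column helpers named here are hypothetical, not
        # part of this module:
        #
        #   request = GoozaliRequest(search_term=scraper_input.search_term)
        #   response = self.session.post(request.url, json=request.body, timeout=10)
        #   data = GoozaliResponse(**response.json())
        #   columns = {column.id: column for column in data.columns}
        #   for row in filter_rows_by_field_and_date(data.rows, columns):
        #       if row.id in seen_ids:
        #           continue
        #       seen_ids.add(row.id)
        #       job_list.append(to_job_post(row, columns))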
        return JobResponse(jobs=job_list)

    def _get_job_details(self, job_id: str) -> dict:
        """
        Retrieves job description and other job details by going to the job page url
        :param job_id:
        :return: dict
        """
        try:
            response = self.session.get(
                f"{self.base_url}/jobs/view/{job_id}", timeout=5
            )
            response.raise_for_status()
        except (RequestException, RetryError, MaxRetryError):
            return {}
        if "Goozali.com/signup" in response.url:
            return {}

        soup = BeautifulSoup(response.text, "html.parser")
        div_content = soup.find(
            "div", class_=lambda x: x and "show-more-less-html__markup" in x
        )
        description = None
        if div_content is not None:
            div_content = remove_attributes(div_content)
            description = div_content.prettify(formatter="html")
            if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
                description = markdown_converter(description)
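        # Past this block, description is None when the markup div was
        # missing; otherwise it holds the prettified HTML, converted to
        # Markdown when DescriptionFormat.MARKDOWN was requested.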

        h3_tag = soup.find(
            "h3", string=lambda text: text and "Job function" in text.strip()
        )

        job_function = None
        if h3_tag:
            job_function_span = h3_tag.find_next(
                "span", class_="description__job-criteria-text"
            )
            if job_function_span:
                job_function = job_function_span.text.strip()

        company_logo = (
            logo_image.get("data-delayed-url")
            if (logo_image := soup.find("img", {"class": "artdeco-entity-image"}))
            else None
        )
        return {
            "description": description,
            "job_level": self._parse_job_level(soup),
            "company_industry": self._parse_company_industry(soup),
            "job_type": self._parse_job_type(soup),
            "job_url_direct": self._parse_job_url_direct(soup),
            "company_logo": company_logo,
            "job_function": job_function,
        }

    def _get_location(self, metadata_card: Optional[Tag]) -> Location:
        """
        Extracts the location data from the job metadata card.
        :param metadata_card:
        :return: location
        """
        location = Location(country=Country.from_string(self.country))
        if metadata_card is not None:
            location_tag = metadata_card.find(
                "span", class_="job-search-card__location"
            )
            location_string = location_tag.text.strip() if location_tag else "N/A"
            parts = location_string.split(", ")
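            # For example, "Austin, Texas" takes the two-part branch below
            # (city="Austin", state="Texas", default country), while
            # "Austin, Texas, United States" takes the three-part branch and
            # resolves the country from its third component; the sample
            # strings are illustrative, not real Goozali values.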
            if len(parts) == 2:
                city, state = parts
                location = Location(
                    city=city,
                    state=state,
                    country=Country.from_string(self.country),
                )
            elif len(parts) == 3:
                city, state, country = parts
                country = Country.from_string(country)
                location = Location(city=city, state=state, country=country)
        return location

    @staticmethod
    def _parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None:
        """
        Gets the job type from job page
        :param soup_job_type:
        :return: list[JobType]
        """
        h3_tag = soup_job_type.find(
            "h3",
            class_="description__job-criteria-subheader",
            string=lambda text: text and "Employment type" in text,
        )
        employment_type = None
        if h3_tag:
            employment_type_span = h3_tag.find_next_sibling(
                "span",
                class_="description__job-criteria-text description__job-criteria-text--criteria",
            )
            if employment_type_span:
                employment_type = employment_type_span.get_text(strip=True)
                employment_type = employment_type.lower()
                employment_type = employment_type.replace("-", "")
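                # For example, an "Employment type" of "Full-time" is
                # normalized here to "fulltime" before the enum lookup.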

        return [get_enum_from_job_type(employment_type)] if employment_type else []

    @staticmethod
    def _parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
        """
        Gets the job level from job page
        :param soup_job_level:
        :return: str
        """
        h3_tag = soup_job_level.find(
            "h3",
            class_="description__job-criteria-subheader",
            string=lambda text: text and "Seniority level" in text,
        )
        job_level = None
        if h3_tag:
            job_level_span = h3_tag.find_next_sibling(
                "span",
                class_="description__job-criteria-text description__job-criteria-text--criteria",
            )
            if job_level_span:
                job_level = job_level_span.get_text(strip=True)

        return job_level

    @staticmethod
    def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
        """
        Gets the company industry from job page
        :param soup_industry:
        :return: str
        """
        h3_tag = soup_industry.find(
            "h3",
            class_="description__job-criteria-subheader",
            string=lambda text: text and "Industries" in text,
        )
        industry = None
        if h3_tag:
            industry_span = h3_tag.find_next_sibling(
                "span",
                class_="description__job-criteria-text description__job-criteria-text--criteria",
            )
            if industry_span:
                industry = industry_span.get_text(strip=True)

        return industry

    def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
        """
        Gets the job url direct from job page
        :param soup:
        :return: str
        """
        job_url_direct = None
        job_url_direct_content = soup.find("code", id="applyUrl")
        if job_url_direct_content:
            job_url_direct_match = self.job_url_direct_regex.search(
                job_url_direct_content.decode_contents().strip()
            )
            if job_url_direct_match:
                job_url_direct = unquote(job_url_direct_match.group())
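                # For example, a hidden apply URL like
                # "?url=https%3A%2F%2Fexample.com%2Fapply" matches the
                # lookbehind pattern and unquotes to
                # "https://example.com/apply" (illustrative value, not a real
                # Goozali payload).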

        return job_url_direct

    @staticmethod
    def job_type_code(job_type_enum: JobType) -> str:
        return {
            JobType.FULL_TIME: "F",
            JobType.PART_TIME: "P",
            JobType.INTERNSHIP: "I",
            JobType.CONTRACT: "C",
            JobType.TEMPORARY: "T",
        }.get(job_type_enum, "")
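

# Usage sketch (illustrative): from application code one would typically do
#
#   scraper = GoozaliScraper()
#   result = scraper.scrape(ScraperInput(search_term="python developer"))
#
# assuming ScraperInput accepts a search_term keyword; until the pipeline in
# scrape() is implemented, this returns an empty JobResponse. job_type_code
# maps a JobType enum to its single-letter code, e.g.
# GoozaliScraper.job_type_code(JobType.FULL_TIME) == "F".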