Initial commit

Models are ready; the Goozali logic file is based on the LinkedIn scraper.
pull/231/head
Yariv Menachem 2024-12-12 17:22:58 +02:00
parent f0ea89b357
commit 3dc15195d5
13 changed files with 605 additions and 10 deletions

View File

@@ -1,16 +1,26 @@
 import asyncio
+from enum import Enum
 
 from db.job_repository import JobRepository
 from jobspy import scrape_jobs
 from jobspy.telegram_bot import TelegramBot
 
+
+class Site(Enum):
+    LINKEDIN = "linkedin"
+    GOOZALI = "goozali"
+    INDEED = "indeed"
+    ZIP_RECRUITER = "zip_recruiter"
+    GLASSDOOR = "glassdoor"
+    GOOGLE = "google"
+
 
 async def main():
     telegramBot = TelegramBot()
     jobRepository = JobRepository()
 
     jobs = scrape_jobs(
-        # site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google"],
-        site_name=["indeed"],
+        # site_name=[Site.LINKEDIN, Site.GOOZALI, Site.GLASSDOOR, Site.INDEED],
+        site_name=[Site.GOOZALI],
         search_term="software engineer",
         google_search_term="software engineer jobs near Tel Aviv Israel since yesterday",
         location="Central, Israel",

View File

@@ -2,6 +2,8 @@ from __future__ import annotations
 
 from abc import ABC, abstractmethod
 
+from jobspy.main import Site
+
 from ..jobs import (
     Enum,
     BaseModel,
@@ -12,14 +14,6 @@ from ..jobs import (
 )
 
 
-class Site(Enum):
-    LINKEDIN = "linkedin"
-    INDEED = "indeed"
-    ZIP_RECRUITER = "zip_recruiter"
-    GLASSDOOR = "glassdoor"
-    GOOGLE = "google"
-
-
 class SalarySource(Enum):
     DIRECT_DATA = "direct_data"
     DESCRIPTION = "description"
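Note that this hunk has jobspy.scrapers import Site from jobspy.main, while the package entry point imports the scrapers in turn, which invites a circular import. A minimal sketch of one conventional way out, assuming a new dependency-free module (the site_type.py name is hypothetical):

# jobspy/site_type.py (hypothetical module) -- both jobspy.main and
# jobspy.scrapers could import Site from here without importing each other.
from enum import Enum

class Site(Enum):
    LINKEDIN = "linkedin"
    GOOZALI = "goozali"
    INDEED = "indeed"
    ZIP_RECRUITER = "zip_recruiter"
    GLASSDOOR = "glassdoor"
    GOOGLE = "google"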

View File

@@ -0,0 +1,439 @@
"""
jobspy.scrapers.Goozali
~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape Goozali.
"""
from __future__ import annotations
import math
import time
import random
import regex as re
from typing import Optional
from datetime import datetime
from bs4.element import Tag
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urlunparse, unquote
from requests.exceptions import RetryError, RequestException
from urllib3.exceptions import MaxRetryError
from .constants import headers
from .. import Scraper, ScraperInput, Site
from ..exceptions import GoozaliException
from ..utils import create_session, remove_attributes, create_logger
from ...jobs import (
JobPost,
Location,
JobResponse,
JobType,
Country,
Compensation,
DescriptionFormat,
)
from ..utils import (
extract_emails_from_text,
get_enum_from_job_type,
currency_parser,
markdown_converter,
)
logger = create_logger("Goozali")
class GoozaliScraper(Scraper):
base_url = "https://www.Goozali.com"
delay = 3
band_delay = 4
jobs_per_page = 25
def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
):
"""
Initializes GoozaliScraper with the Goozali job search url
"""
super().__init__(Site.GOOZALI, proxies=proxies, ca_cert=ca_cert)
self.session = create_session(
proxies=self.proxies,
ca_cert=ca_cert,
is_tls=False,
has_retry=True,
delay=5,
clear_cookies=True,
)
self.session.headers.update(headers)
self.scraper_input = None
self.country = "worldwide"
self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+')
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
Scrapes Goozali for jobs with scraper_input criteria
:param scraper_input:
:return: job_response
"""
self.scraper_input = scraper_input
job_list: list[JobPost] = []
seen_ids = set()
start = scraper_input.offset // 10 * 10 if scraper_input.offset else 0
request_count = 0
seconds_old = (
scraper_input.hours_old * 3600 if scraper_input.hours_old else None
)
continue_search = (
    lambda: len(job_list) < scraper_input.results_wanted and start < 1000
)
for location in scraper_input.locations:
logger.info(f"start searching for location: {location}")
while continue_search():
request_count += 1
logger.info(
    f"search page: {request_count} / {math.ceil(scraper_input.results_wanted / 10)}"
)
params = {
"keywords": scraper_input.search_term,
"location": location,
"distance": scraper_input.distance,
"f_WT": 2 if scraper_input.is_remote else None,
"f_JT": (
self.job_type_code(scraper_input.job_type)
if scraper_input.job_type
else None
),
"pageNum": 0,
"start": start,
"f_AL": "true" if scraper_input.easy_apply else None,
"f_C": (
",".join(map(str, scraper_input.Goozali_company_ids))
if scraper_input.Goozali_company_ids
else None
),
}
if seconds_old is not None:
params["f_TPR"] = f"r{seconds_old}"
params = {k: v for k, v in params.items() if v is not None}
try:
response = self.session.get(
f"{self.base_url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
params=params,
timeout=10,
)
if response.status_code not in range(200, 400):
if response.status_code == 429:
    err = "429 Response - Blocked by Goozali for too many requests"
else:
    err = f"Goozali response status code {response.status_code}"
err += f" - {response.text}"
logger.error(err)
return JobResponse(jobs=job_list)
except MaxRetryError as e:
"""Raised when the maximum number of retries is exceeded."""
logger.error(f"RetryError: {str(e)}")
logger.error(f"MaxRetryError for location: {location}")
break
except RetryError as e:
"""Custom retries logic failed"""
logger.error(f"RetryError: {str(e)}")
logger.error(f"RetryError for location: {location}")
break
except Exception as e:
if "Proxy responded with" in str(e):
logger.error(f"Goozali: Bad proxy")
else:
logger.error(f"Goozali: {str(e)}")
return JobResponse(jobs=job_list)
soup = BeautifulSoup(response.text, "html.parser")
job_cards = soup.find_all("div", class_="base-search-card")
if len(job_cards) == 0:
break
for job_card in job_cards:
href_tag = job_card.find("a", class_="base-card__full-link")
if href_tag and "href" in href_tag.attrs:
href = href_tag.attrs["href"].split("?")[0]
job_id = href.split("-")[-1]
if job_id in seen_ids:
continue
seen_ids.add(job_id)
try:
fetch_desc = scraper_input.Goozali_fetch_description
job_post = self._process_job(job_card, job_id, fetch_desc)
if job_post:
job_list.append(job_post)
if not continue_search():
break
except Exception as e:
raise GoozaliException(str(e))
if continue_search():
time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
start += len(job_list)
job_list = job_list[: scraper_input.results_wanted]
return JobResponse(jobs=job_list)
def _process_job(
self, job_card: Tag, job_id: str, full_descr: bool
) -> Optional[JobPost]:
salary_tag = job_card.find("span", class_="job-search-card__salary-info")
compensation = None
if salary_tag:
salary_text = salary_tag.get_text(separator=" ").strip()
salary_values = [currency_parser(value) for value in salary_text.split("-")]
salary_min = salary_values[0]
salary_max = salary_values[1]
currency = salary_text[0] if salary_text[0] != "$" else "USD"
compensation = Compensation(
min_amount=int(salary_min),
max_amount=int(salary_max),
currency=currency,
)
title_tag = job_card.find("span", class_="sr-only")
title = title_tag.get_text(strip=True) if title_tag else "N/A"
company_tag = job_card.find("h4", class_="base-search-card__subtitle")
company_a_tag = company_tag.find("a") if company_tag else None
company_url = (
urlunparse(urlparse(company_a_tag.get("href"))._replace(query=""))
if company_a_tag and company_a_tag.has_attr("href")
else ""
)
company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A"
metadata_card = job_card.find("div", class_="base-search-card__metadata")
location = self._get_location(metadata_card)
datetime_tag = (
metadata_card.find("time", class_="job-search-card__listdate")
if metadata_card
else None
)
date_posted = None
if datetime_tag and "datetime" in datetime_tag.attrs:
datetime_str = datetime_tag["datetime"]
try:
date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
except ValueError:
date_posted = None
job_details = {}
if full_descr:
job_details = self._get_job_details(job_id)
return JobPost(
id=f"li-{job_id}",
title=title,
company_name=company,
company_url=company_url,
location=location,
date_posted=date_posted,
datetime_posted=date_posted,
job_url=f"{self.base_url}/jobs/view/{job_id}",
compensation=compensation,
job_type=job_details.get("job_type"),
job_level=(job_details.get("job_level") or "").lower(),
company_industry=job_details.get("company_industry"),
description=job_details.get("description"),
job_url_direct=job_details.get("job_url_direct"),
emails=extract_emails_from_text(job_details.get("description")),
company_logo=job_details.get("company_logo"),
job_function=job_details.get("job_function"),
)
def _get_job_details(self, job_id: str) -> dict:
"""
Retrieves job description and other job details by going to the job page url
:param job_page_url:
:return: dict
"""
try:
response = self.session.get(
f"{self.base_url}/jobs/view/{job_id}", timeout=5
)
response.raise_for_status()
except Exception:
return {}
if "Goozali.com/signup" in response.url:
return {}
soup = BeautifulSoup(response.text, "html.parser")
div_content = soup.find(
"div", class_=lambda x: x and "show-more-less-html__markup" in x
)
description = None
if div_content is not None:
div_content = remove_attributes(div_content)
description = div_content.prettify(formatter="html")
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description)
h3_tag = soup.find(
"h3", text=lambda text: text and "Job function" in text.strip()
)
job_function = None
if h3_tag:
job_function_span = h3_tag.find_next(
"span", class_="description__job-criteria-text"
)
if job_function_span:
job_function = job_function_span.text.strip()
company_logo = (
logo_image.get("data-delayed-url")
if (logo_image := soup.find("img", {"class": "artdeco-entity-image"}))
else None
)
return {
"description": description,
"job_level": self._parse_job_level(soup),
"company_industry": self._parse_company_industry(soup),
"job_type": self._parse_job_type(soup),
"job_url_direct": self._parse_job_url_direct(soup),
"company_logo": company_logo,
"job_function": job_function,
}
def _get_location(self, metadata_card: Optional[Tag]) -> Location:
"""
Extracts the location data from the job metadata card.
:param metadata_card
:return: location
"""
location = Location(country=Country.from_string(self.country))
if metadata_card is not None:
location_tag = metadata_card.find(
"span", class_="job-search-card__location"
)
location_string = location_tag.text.strip() if location_tag else "N/A"
parts = location_string.split(", ")
if len(parts) == 2:
city, state = parts
location = Location(
city=city,
state=state,
country=Country.from_string(self.country),
)
elif len(parts) == 3:
city, state, country = parts
country = Country.from_string(country)
location = Location(city=city, state=state, country=country)
return location
@staticmethod
def _parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None:
"""
Gets the job type from job page
:param soup_job_type:
:return: JobType
"""
h3_tag = soup_job_type.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Employment type" in text,
)
employment_type = None
if h3_tag:
employment_type_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if employment_type_span:
employment_type = employment_type_span.get_text(strip=True)
employment_type = employment_type.lower()
employment_type = employment_type.replace("-", "")
return [get_enum_from_job_type(employment_type)] if employment_type else []
@staticmethod
def _parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
"""
Gets the job level from job page
:param soup_job_level:
:return: str
"""
h3_tag = soup_job_level.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Seniority level" in text,
)
job_level = None
if h3_tag:
job_level_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if job_level_span:
job_level = job_level_span.get_text(strip=True)
return job_level
@staticmethod
def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
"""
Gets the company industry from job page
:param soup_industry:
:return: str
"""
h3_tag = soup_industry.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Industries" in text,
)
industry = None
if h3_tag:
industry_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if industry_span:
industry = industry_span.get_text(strip=True)
return industry
def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
"""
Gets the job url direct from job page
:param soup:
:return: str
"""
job_url_direct = None
job_url_direct_content = soup.find("code", id="applyUrl")
if job_url_direct_content:
job_url_direct_match = self.job_url_direct_regex.search(
job_url_direct_content.decode_contents().strip()
)
if job_url_direct_match:
job_url_direct = unquote(job_url_direct_match.group())
return job_url_direct
@staticmethod
def job_type_code(job_type_enum: JobType) -> str:
return {
JobType.FULL_TIME: "F",
JobType.PART_TIME: "P",
JobType.INTERNSHIP: "I",
JobType.CONTRACT: "C",
JobType.TEMPORARY: "T",
}.get(job_type_enum, "")
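For readers who want to drive the scraper directly rather than through scrape_jobs, a minimal sketch; the ScraperInput field names are taken from how scrape() reads them above (search_term, locations, results_wanted), and any constructor details beyond that are assumptions:

# Hypothetical direct usage of GoozaliScraper, bypassing scrape_jobs;
# ScraperInput and Site come from the package imports at the top of this file.
scraper = GoozaliScraper()
scraper_input = ScraperInput(
    site_type=[Site.GOOZALI],       # assumption: field name in ScraperInput
    search_term="software engineer",
    locations=["Central, Israel"],  # scrape() iterates scraper_input.locations
    results_wanted=25,
)
response = scraper.scrape(scraper_input)
for job in response.jobs:
    print(job.title, job.company_name, job.job_url)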

View File

@@ -0,0 +1,8 @@
headers = {
"authority": "www.linkedin.com",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-language": "en-US,en;q=0.9",
"cache-control": "max-age=0",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
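Since this constants file is carried over from the LinkedIn scraper, the authority header still names www.linkedin.com even though GoozaliScraper.base_url points at www.Goozali.com. A hedged sketch of the adjustment, assuming Goozali is in fact served from that host:

# Hypothetical override: align the authority with GoozaliScraper.base_url,
# reusing the accept/user-agent values defined above unchanged.
headers = {
    **headers,
    "authority": "www.goozali.com",  # assumption: lowercase host of base_url
}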

View File

@@ -0,0 +1,20 @@
from typing import Optional
from jobspy.scrapers.goozali.model import GoozaliColumnTypeOptions
class GoozaliColumn:
def __init__(self, id: str, name: str, description: Optional[str], type: str, typeOptions: GoozaliColumnTypeOptions,
default: Optional[str], initialCreatedTime: str, initialCreatedByUserId: str,
lastModifiedTime: str, lastModifiedByUserId: str, isEditableFromSync: bool):
self.id = id
self.name = name
self.description = description
self.type = type
self.typeOptions = typeOptions
self.default = default
self.initialCreatedTime = initialCreatedTime
self.initialCreatedByUserId = initialCreatedByUserId
self.lastModifiedTime = lastModifiedTime
self.lastModifiedByUserId = lastModifiedByUserId
self.isEditableFromSync = isEditableFromSync
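The model classes in this commit are hand-written __init__ wrappers; a dataclass yields the same constructor with less code and is the idiomatic form. A sketch of GoozaliColumn rewritten that way (the same pattern would apply to the other model classes below):

from dataclasses import dataclass
from typing import Optional

from jobspy.scrapers.goozali.model import GoozaliColumnTypeOptions

@dataclass
class GoozaliColumn:
    # Each field replaces one hand-written self.x = x assignment above.
    id: str
    name: str
    description: Optional[str]
    type: str
    typeOptions: GoozaliColumnTypeOptions
    default: Optional[str]
    initialCreatedTime: str
    initialCreatedByUserId: str
    lastModifiedTime: str
    lastModifiedByUserId: str
    isEditableFromSync: bool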

View File

@@ -0,0 +1,8 @@
from typing import Optional
class GoozaliColumnChoice:
def __init__(self, id: str, name: str, color: Optional[str] = None):
self.id = id
self.name = name
self.color = color

View File

@@ -0,0 +1,10 @@
from typing import Dict, List
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
class GoozaliColumnTypeOptions:
def __init__(self, choiceOrder: List[str], choices: Dict[str, GoozaliColumnChoice], disableColors: bool):
self.choiceOrder = choiceOrder
self.choices = choices
self.disableColors = disableColors

View File

@@ -0,0 +1,7 @@
from jobspy.scrapers.goozali.model.GoozaliResponseData import GoozaliResponseData
class GoozaliResponse:
def __init__(self, msg: str, data: GoozaliResponseData):
self.msg = msg
self.data = data

View File

@@ -0,0 +1,8 @@
from typing import Dict, List
class GoozaliRow:
def __init__(self, id: str, createdTime: str, cellValuesByColumnId: Dict[str, List[str]]):
self.id = id
self.createdTime = createdTime
self.cellValuesByColumnId = cellValuesByColumnId

View File

@@ -0,0 +1,17 @@
from typing import Dict, List
from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
from jobspy.scrapers.goozali.model.GoozaliRow import GoozaliRow
class GoozaliTable:
def __init__(self, applicationId: str, id: str, name: str, columns: List[GoozaliColumn], primaryColumnId: str,
meaningfulColumnOrder: List[Dict[str, str]], viewOrder: List[str], rows: List[GoozaliRow]):
self.applicationId = applicationId
self.id = id
self.name = name
self.columns = columns
self.primaryColumnId = primaryColumnId
self.meaningfulColumnOrder = meaningfulColumnOrder
self.viewOrder = viewOrder
self.rows = rows

View File

@@ -0,0 +1,6 @@
from jobspy.scrapers.goozali.model import GoozaliTable
class GoozaliResponseData:
def __init__(self, table: GoozaliTable):
self.table = table
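Nothing in the commit yet maps a raw Goozali payload onto these models. A minimal parsing sketch, assuming the endpoint returns JSON shaped like {"msg": ..., "data": {"table": ...}} as the field names suggest; the helper name and the .get defaults are hypothetical, and import paths follow the module-per-class layout above:

from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
from jobspy.scrapers.goozali.model.GoozaliColumnTypeOptions import GoozaliColumnTypeOptions
from jobspy.scrapers.goozali.model.GoozaliRow import GoozaliRow
from jobspy.scrapers.goozali.model.GoozaliTable import GoozaliTable
from jobspy.scrapers.goozali.model.GoozaliResponse import GoozaliResponse
from jobspy.scrapers.goozali.model.GoozaliResponseData import GoozaliResponseData

def goozali_response_from_json(payload: dict) -> GoozaliResponse:
    # Hypothetical helper: field names mirror the constructors above; the
    # actual payload shape is an assumption, not confirmed by this commit.
    t = payload["data"]["table"]
    columns = []
    for c in t["columns"]:
        opts = c.get("typeOptions") or {}
        type_options = GoozaliColumnTypeOptions(
            choiceOrder=opts.get("choiceOrder", []),
            choices={
                cid: GoozaliColumnChoice(id=ch["id"], name=ch["name"], color=ch.get("color"))
                for cid, ch in opts.get("choices", {}).items()
            },
            disableColors=opts.get("disableColors", False),
        )
        columns.append(GoozaliColumn(
            id=c["id"], name=c["name"], description=c.get("description"),
            type=c["type"], typeOptions=type_options, default=c.get("default"),
            initialCreatedTime=c.get("initialCreatedTime", ""),
            initialCreatedByUserId=c.get("initialCreatedByUserId", ""),
            lastModifiedTime=c.get("lastModifiedTime", ""),
            lastModifiedByUserId=c.get("lastModifiedByUserId", ""),
            isEditableFromSync=c.get("isEditableFromSync", False),
        ))
    rows = [
        GoozaliRow(id=r["id"], createdTime=r["createdTime"],
                   cellValuesByColumnId=r.get("cellValuesByColumnId", {}))
        for r in t["rows"]
    ]
    table = GoozaliTable(
        applicationId=t["applicationId"], id=t["id"], name=t["name"],
        columns=columns, primaryColumnId=t["primaryColumnId"],
        meaningfulColumnOrder=t.get("meaningfulColumnOrder", []),
        viewOrder=t.get("viewOrder", []), rows=rows,
    )
    return GoozaliResponse(msg=payload["msg"], data=GoozaliResponseData(table=table))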

src/tests/test_goozali.py Normal file
View File

@@ -0,0 +1,68 @@
from jobspy import scrape_jobs
import pandas as pd
from jobspy.scrapers.goozali.model import GoozaliColumnTypeOptions, GoozaliResponse, GoozaliTable
from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
from jobspy.scrapers.goozali.model.GoozaliRow import GoozaliRow
from jobspy.scrapers.goozali.model.GoozaliResponseData import GoozaliResponseData
def test_goozali():
    # NOTE: this exercises scrape_jobs against glassdoor; the Goozali path
    # itself is not scraped here yet.
    result = scrape_jobs(
site_name="glassdoor",
search_term="engineer",
results_wanted=5,
)
assert (
isinstance(result, pd.DataFrame) and len(result) == 5
), "Result should be a non-empty DataFrame"
def createMockGoozaliResponse() -> GoozaliResponse:
data = GoozaliResponseData(table=GoozaliTable(
applicationId="app7OQjqEzTtCRq7u",
id="tblBQjp5Aw6O172VY",
name="Shared view table",
columns=[
GoozaliColumn(
id="fldIf9DbRpNRLJXuD",
name="Industry",
description=None,
type="multiSelect",
typeOptions=GoozaliColumnTypeOptions(
choiceOrder=["selcE6QUv4vWIIcZR", "sel0JIQKMmz3jCFUN", "selzhpwlfPssG4OEx"],
choices={
"selwhDNBom2dZJkgv": GoozaliColumnChoice(id="selwhDNBom2dZJkgv", name="HealthTech", color="orange"),
"selReHesNOVD3PvCo": GoozaliColumnChoice(id="selReHesNOVD3PvCo", name="Automotive", color="pink")
},
disableColors=False
),
default=None,
initialCreatedTime="2022-12-29T10:23:21.000Z",
initialCreatedByUserId="usr1fVy2RIyCuGHec",
lastModifiedTime="2024-07-21T09:30:02.000Z",
lastModifiedByUserId="usr1fVy2RIyCuGHec",
isEditableFromSync=False
)
],
primaryColumnId="fldLT11B0cpV6p9Uz",
meaningfulColumnOrder=[
{"columnId": "fldLT11B0cpV6p9Uz", "visibility": True},
{"columnId": "fldIf9DbRpNRLJXuD", "visibility": True, "width": 368},
{"columnId": "fldOLt34j8Pm2dcCq", "visibility": True, "width": 182}
],
viewOrder=["viwNRSqqmqZLP0a3C"],
rows=[
GoozaliRow(
id="recwiKgHT9mJrqoxa",
createdTime="2023-01-09T10:32:09.000Z",
cellValuesByColumnId={
"fldLT11B0cpV6p9Uz": ["3M"],
"fldIf9DbRpNRLJXuD": ["selwhDNBom2dZJkgv", "selReHesNOVD3PvCo"]
}
)
]
))
return GoozaliResponse(msg="SUCCESS", data=data)
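createMockGoozaliResponse is built but not yet consumed; a sketch of a test that exercises it, with illustrative assertions derived from the mock data above:

def test_mock_goozali_response_shape():
    # Illustrative test: checks the mock round-trips the values set above.
    response = createMockGoozaliResponse()
    assert response.msg == "SUCCESS"
    table = response.data.table
    assert table.name == "Shared view table"
    assert table.columns[0].type == "multiSelect"
    assert "selwhDNBom2dZJkgv" in table.columns[0].typeOptions.choices
    assert table.rows[0].cellValuesByColumnId["fldLT11B0cpV6p9Uz"] == ["3M"]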