mirror of https://github.com/Bunsly/JobSpy
init commit
models ready, logic file for goozali based on the linkedin scraper
pull/231/head
parent f0ea89b357
commit 3dc15195d5
@@ -1,16 +1,26 @@
import asyncio

from enum import Enum

from db.job_repository import JobRepository
from jobspy import scrape_jobs
from jobspy.telegram_bot import TelegramBot


class Site(Enum):
    LINKEDIN = "linkedin"
    GOOZALI = "goozali"
    INDEED = "indeed"
    ZIP_RECRUITER = "zip_recruiter"
    GLASSDOOR = "glassdoor"
    GOOGLE = "google"


async def main():
    telegramBot = TelegramBot()
    jobRepository = JobRepository()

    jobs = scrape_jobs(
        # site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google"],
        # site_name=["indeed"],  # commented out: a second active site_name kwarg below would be a SyntaxError
        # site_name=[Site.LINKEDIN, Site.GOOZALI, Site.GLASSDOOR, Site.INDEED],
        site_name=[Site.GOOZALI],
        search_term="software engineer",
        google_search_term="software engineer jobs near Tel Aviv Israel since yesterday",
        location="Central, Israel",
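For orientation (the hunk above is truncated mid-call by the diff): an async main() like this is normally driven with asyncio.run at module scope — a minimal sketch, not part of the commit:

# Hypothetical entry point for the main() coroutine defined above.
if __name__ == "__main__":
    asyncio.run(main())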
@@ -2,6 +2,8 @@ from __future__ import annotations

from abc import ABC, abstractmethod

from jobspy.main import Site

from ..jobs import (
    Enum,
    BaseModel,
@@ -12,14 +14,6 @@ from ..jobs import (
)


class Site(Enum):
    LINKEDIN = "linkedin"
    INDEED = "indeed"
    ZIP_RECRUITER = "zip_recruiter"
    GLASSDOOR = "glassdoor"
    GOOGLE = "google"


class SalarySource(Enum):
    DIRECT_DATA = "direct_data"
    DESCRIPTION = "description"
@@ -0,0 +1,439 @@
"""
jobspy.scrapers.Goozali
~~~~~~~~~~~~~~~~~~~~~~~

This module contains routines to scrape Goozali.
"""

from __future__ import annotations

import math
import random
import time
from datetime import datetime
from typing import Optional

import regex as re
from bs4 import BeautifulSoup
from bs4.element import Tag
from urllib.parse import urlparse, urlunparse, unquote
from requests.exceptions import RetryError, RequestException
from urllib3.exceptions import MaxRetryError

from .constants import headers
from .. import Scraper, ScraperInput, Site
from ..exceptions import GoozaliException
from ..utils import (
    create_logger,
    create_session,
    currency_parser,
    extract_emails_from_text,
    get_enum_from_job_type,
    markdown_converter,
    remove_attributes,
)
from ...jobs import (
    JobPost,
    Location,
    JobResponse,
    JobType,
    Country,
    Compensation,
    DescriptionFormat,
)

logger = create_logger("Goozali")


class GoozaliScraper(Scraper):
    base_url = "https://www.Goozali.com"
    delay = 3
    band_delay = 4
    jobs_per_page = 25

    def __init__(
        self, proxies: list[str] | str | None = None, ca_cert: str | None = None
    ):
        """
        Initializes GoozaliScraper with the Goozali job search url
        """
        super().__init__(Site.GOOZALI, proxies=proxies, ca_cert=ca_cert)
        self.session = create_session(
            proxies=self.proxies,
            ca_cert=ca_cert,
            is_tls=False,
            has_retry=True,
            delay=5,
            clear_cookies=True,
        )
        self.session.headers.update(headers)
        self.scraper_input = None
        self.country = "worldwide"
        self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+')

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes Goozali for jobs with scraper_input criteria
        :param scraper_input:
        :return: job_response
        """
        self.scraper_input = scraper_input
        job_list: list[JobPost] = []
        seen_ids = set()
        start = scraper_input.offset // 10 * 10 if scraper_input.offset else 0
        request_count = 0
        seconds_old = (
            scraper_input.hours_old * 3600 if scraper_input.hours_old else None
        )
        continue_search = (
            lambda: len(job_list) < scraper_input.results_wanted and start < 1000
        )
        for location in scraper_input.locations:
            logger.info(f"start searching for location: {location}")
            while continue_search():
                request_count += 1
                logger.info(
                    f"search page: {request_count} / {math.ceil(scraper_input.results_wanted / 10)}"
                )
                params = {
                    "keywords": scraper_input.search_term,
                    "location": location,
                    "distance": scraper_input.distance,
                    "f_WT": 2 if scraper_input.is_remote else None,
                    "f_JT": (
                        self.job_type_code(scraper_input.job_type)
                        if scraper_input.job_type
                        else None
                    ),
                    "pageNum": 0,
                    "start": start,
                    "f_AL": "true" if scraper_input.easy_apply else None,
                    "f_C": (
                        ",".join(map(str, scraper_input.Goozali_company_ids))
                        if scraper_input.Goozali_company_ids
                        else None
                    ),
                }
                if seconds_old is not None:
                    params["f_TPR"] = f"r{seconds_old}"

                params = {k: v for k, v in params.items() if v is not None}
                try:
                    response = self.session.get(
                        f"{self.base_url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
                        params=params,
                        timeout=10,
                    )
                    if response.status_code not in range(200, 400):
                        if response.status_code == 429:
                            err = "429 Response - Blocked by Goozali for too many requests"
                        else:
                            err = f"Goozali response status code {response.status_code}"
                            err += f" - {response.text}"
                        logger.error(err)
                        return JobResponse(jobs=job_list)
                except MaxRetryError as e:
                    # Raised when the maximum number of retries is exceeded.
                    logger.error(f"MaxRetryError: {str(e)}")
                    logger.error(f"MaxRetryError for location: {location}")
                    break
                except RetryError as e:
                    # Custom retry logic failed.
                    logger.error(f"RetryError: {str(e)}")
                    logger.error(f"RetryError for location: {location}")
                    break
                except Exception as e:
                    if "Proxy responded with" in str(e):
                        logger.error("Goozali: Bad proxy")
                    else:
                        logger.error(f"Goozali: {str(e)}")
                    return JobResponse(jobs=job_list)
                soup = BeautifulSoup(response.text, "html.parser")
                job_cards = soup.find_all("div", class_="base-search-card")
                if len(job_cards) == 0:
                    break

                for job_card in job_cards:
                    href_tag = job_card.find("a", class_="base-card__full-link")
                    if href_tag and "href" in href_tag.attrs:
                        href = href_tag.attrs["href"].split("?")[0]
                        job_id = href.split("-")[-1]

                        if job_id in seen_ids:
                            continue
                        seen_ids.add(job_id)

                        try:
                            fetch_desc = scraper_input.Goozali_fetch_description
                            job_post = self._process_job(job_card, job_id, fetch_desc)
                            if job_post:
                                job_list.append(job_post)
                            if not continue_search():
                                break
                        except Exception as e:
                            raise GoozaliException(str(e))

                if continue_search():
                    time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
                    start += len(job_list)

        job_list = job_list[: scraper_input.results_wanted]
        return JobResponse(jobs=job_list)
    def _process_job(
        self, job_card: Tag, job_id: str, full_descr: bool
    ) -> Optional[JobPost]:
        salary_tag = job_card.find("span", class_="job-search-card__salary-info")

        compensation = None
        if salary_tag:
            salary_text = salary_tag.get_text(separator=" ").strip()
            salary_values = [currency_parser(value) for value in salary_text.split("-")]
            salary_min = salary_values[0]
            salary_max = salary_values[1]
            currency = salary_text[0] if salary_text[0] != "$" else "USD"

            compensation = Compensation(
                min_amount=int(salary_min),
                max_amount=int(salary_max),
                currency=currency,
            )

        title_tag = job_card.find("span", class_="sr-only")
        title = title_tag.get_text(strip=True) if title_tag else "N/A"

        company_tag = job_card.find("h4", class_="base-search-card__subtitle")
        company_a_tag = company_tag.find("a") if company_tag else None
        company_url = (
            urlunparse(urlparse(company_a_tag.get("href"))._replace(query=""))
            if company_a_tag and company_a_tag.has_attr("href")
            else ""
        )
        company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A"

        metadata_card = job_card.find("div", class_="base-search-card__metadata")
        location = self._get_location(metadata_card)

        datetime_tag = (
            metadata_card.find("time", class_="job-search-card__listdate")
            if metadata_card
            else None
        )
        date_posted = None
        if datetime_tag and "datetime" in datetime_tag.attrs:
            datetime_str = datetime_tag["datetime"]
            try:
                date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
            except ValueError:
                date_posted = None
        job_details = {}
        if full_descr:
            job_details = self._get_job_details(job_id)

        return JobPost(
            id=f"li-{job_id}",
            title=title,
            company_name=company,
            company_url=company_url,
            location=location,
            date_posted=date_posted,
            datetime_posted=date_posted,
            job_url=f"{self.base_url}/jobs/view/{job_id}",
            compensation=compensation,
            job_type=job_details.get("job_type"),
            job_level=job_details.get("job_level", "").lower(),
            company_industry=job_details.get("company_industry"),
            description=job_details.get("description"),
            job_url_direct=job_details.get("job_url_direct"),
            emails=extract_emails_from_text(job_details.get("description")),
            company_logo=job_details.get("company_logo"),
            job_function=job_details.get("job_function"),
        )
    def _get_job_details(self, job_id: str) -> dict:
        """
        Retrieves the job description and other details from the job page
        :param job_id:
        :return: dict
        """
        try:
            response = self.session.get(
                f"{self.base_url}/jobs/view/{job_id}", timeout=5
            )
            response.raise_for_status()
        except RequestException:
            return {}
        if "Goozali.com/signup" in response.url:
            return {}

        soup = BeautifulSoup(response.text, "html.parser")
        div_content = soup.find(
            "div", class_=lambda x: x and "show-more-less-html__markup" in x
        )
        description = None
        if div_content is not None:
            div_content = remove_attributes(div_content)
            description = div_content.prettify(formatter="html")
            if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
                description = markdown_converter(description)

        h3_tag = soup.find(
            "h3", text=lambda text: text and "Job function" in text.strip()
        )

        job_function = None
        if h3_tag:
            job_function_span = h3_tag.find_next(
                "span", class_="description__job-criteria-text"
            )
            if job_function_span:
                job_function = job_function_span.text.strip()

        company_logo = (
            logo_image.get("data-delayed-url")
            if (logo_image := soup.find("img", {"class": "artdeco-entity-image"}))
            else None
        )
        return {
            "description": description,
            "job_level": self._parse_job_level(soup),
            "company_industry": self._parse_company_industry(soup),
            "job_type": self._parse_job_type(soup),
            "job_url_direct": self._parse_job_url_direct(soup),
            "company_logo": company_logo,
            "job_function": job_function,
        }
    def _get_location(self, metadata_card: Optional[Tag]) -> Location:
        """
        Extracts the location data from the job metadata card.
        :param metadata_card:
        :return: location
        """
        location = Location(country=Country.from_string(self.country))
        if metadata_card is not None:
            location_tag = metadata_card.find(
                "span", class_="job-search-card__location"
            )
            location_string = location_tag.text.strip() if location_tag else "N/A"
            parts = location_string.split(", ")
            if len(parts) == 2:
                city, state = parts
                location = Location(
                    city=city,
                    state=state,
                    country=Country.from_string(self.country),
                )
            elif len(parts) == 3:
                city, state, country = parts
                country = Country.from_string(country)
                location = Location(city=city, state=state, country=country)
        return location
    @staticmethod
    def _parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None:
        """
        Gets the job type from the job page
        :param soup_job_type:
        :return: JobType
        """
        h3_tag = soup_job_type.find(
            "h3",
            class_="description__job-criteria-subheader",
            string=lambda text: "Employment type" in text,
        )
        employment_type = None
        if h3_tag:
            employment_type_span = h3_tag.find_next_sibling(
                "span",
                class_="description__job-criteria-text description__job-criteria-text--criteria",
            )
            if employment_type_span:
                employment_type = employment_type_span.get_text(strip=True)
                employment_type = employment_type.lower()
                employment_type = employment_type.replace("-", "")

        return [get_enum_from_job_type(employment_type)] if employment_type else []
    @staticmethod
    def _parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
        """
        Gets the job level from the job page
        :param soup_job_level:
        :return: str
        """
        h3_tag = soup_job_level.find(
            "h3",
            class_="description__job-criteria-subheader",
            string=lambda text: "Seniority level" in text,
        )
        job_level = None
        if h3_tag:
            job_level_span = h3_tag.find_next_sibling(
                "span",
                class_="description__job-criteria-text description__job-criteria-text--criteria",
            )
            if job_level_span:
                job_level = job_level_span.get_text(strip=True)

        return job_level
    @staticmethod
    def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
        """
        Gets the company industry from the job page
        :param soup_industry:
        :return: str
        """
        h3_tag = soup_industry.find(
            "h3",
            class_="description__job-criteria-subheader",
            string=lambda text: "Industries" in text,
        )
        industry = None
        if h3_tag:
            industry_span = h3_tag.find_next_sibling(
                "span",
                class_="description__job-criteria-text description__job-criteria-text--criteria",
            )
            if industry_span:
                industry = industry_span.get_text(strip=True)

        return industry
    def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
        """
        Gets the direct job url from the job page
        :param soup:
        :return: str
        """
        job_url_direct = None
        job_url_direct_content = soup.find("code", id="applyUrl")
        if job_url_direct_content:
            job_url_direct_match = self.job_url_direct_regex.search(
                job_url_direct_content.decode_contents().strip()
            )
            if job_url_direct_match:
                job_url_direct = unquote(job_url_direct_match.group())

        return job_url_direct
    @staticmethod
    def job_type_code(job_type_enum: JobType) -> str:
        return {
            JobType.FULL_TIME: "F",
            JobType.PART_TIME: "P",
            JobType.INTERNSHIP: "I",
            JobType.CONTRACT: "C",
            JobType.TEMPORARY: "T",
        }.get(job_type_enum, "")
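The scraper above implements the shared Scraper interface, so it can be exercised on its own — a minimal sketch, assuming ScraperInput exposes the fields scrape() reads (search_term, locations, results_wanted) and that its constructor accepts them by name; none of this signature is confirmed by the commit:

# Illustrative only: drive GoozaliScraper without going through scrape_jobs().
# The ScraperInput field names mirror the attributes scrape() accesses above;
# the exact constructor signature is an assumption.
from jobspy.scrapers import ScraperInput, Site
from jobspy.scrapers.goozali import GoozaliScraper

scraper = GoozaliScraper()
job_response = scraper.scrape(
    ScraperInput(
        site_type=[Site.GOOZALI],
        search_term="software engineer",
        locations=["Central, Israel"],
        results_wanted=10,
    )
)
for job in job_response.jobs:
    print(job.title, job.job_url)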
@@ -0,0 +1,8 @@
headers = {
    # NOTE: carried over from the LinkedIn scraper this module is based on;
    # the authority still points at www.linkedin.com.
    "authority": "www.linkedin.com",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "en-US,en;q=0.9",
    "cache-control": "max-age=0",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
@@ -0,0 +1,20 @@
from typing import Optional

from jobspy.scrapers.goozali.model import GoozaliColumnTypeOptions


class GoozaliColumn:
    def __init__(self, id: str, name: str, description: Optional[str], type: str,
                 typeOptions: GoozaliColumnTypeOptions, default: Optional[str],
                 initialCreatedTime: str, initialCreatedByUserId: str,
                 lastModifiedTime: str, lastModifiedByUserId: str, isEditableFromSync: bool):
        self.id = id
        self.name = name
        self.description = description
        self.type = type
        self.typeOptions = typeOptions
        self.default = default
        self.initialCreatedTime = initialCreatedTime
        self.initialCreatedByUserId = initialCreatedByUserId
        self.lastModifiedTime = lastModifiedTime
        self.lastModifiedByUserId = lastModifiedByUserId
        self.isEditableFromSync = isEditableFromSync
@@ -0,0 +1,8 @@
from typing import Optional


class GoozaliColumnChoice:
    def __init__(self, id: str, name: str, color: Optional[str] = None):
        self.id = id
        self.name = name
        self.color = color
@@ -0,0 +1,10 @@
from typing import Dict, List

from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice


class GoozaliColumnTypeOptions:
    def __init__(self, choiceOrder: List[str], choices: Dict[str, GoozaliColumnChoice], disableColors: bool):
        self.choiceOrder = choiceOrder
        self.choices = choices
        self.disableColors = disableColors
@@ -0,0 +1,7 @@
from jobspy.scrapers.goozali.model.GozaaliResponseData import GoozaliResponseData


class GoozaliResponse:
    def __init__(self, msg: str, data: GoozaliResponseData):
        self.msg = msg
        self.data = data
@@ -0,0 +1,8 @@
from typing import Dict, List


class GoozaliRow:
    def __init__(self, id: str, createdTime: str, cellValuesByColumnId: Dict[str, List[str]]):
        self.id = id
        self.createdTime = createdTime
        self.cellValuesByColumnId = cellValuesByColumnId
@@ -0,0 +1,17 @@
from typing import Dict, List

from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
from jobspy.scrapers.goozali.model.GoozaliRow import GoozaliRow


class GoozaliTable:
    def __init__(self, applicationId: str, id: str, name: str, columns: List[GoozaliColumn], primaryColumnId: str,
                 meaningfulColumnOrder: List[Dict[str, str]], viewOrder: List[str], rows: List[GoozaliRow]):
        self.applicationId = applicationId
        self.id = id
        self.name = name
        self.columns = columns
        self.primaryColumnId = primaryColumnId
        self.meaningfulColumnOrder = meaningfulColumnOrder
        self.viewOrder = viewOrder
        self.rows = rows
@@ -0,0 +1,6 @@
from jobspy.scrapers.goozali.model import GoozaliTable


class GoozaliResponseData:
    def __init__(self, table: GoozaliTable):
        self.table = table
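Taken together, these model classes mirror one nested JSON payload (msg → data → table → columns/rows). A minimal sketch of how the rows of such a payload could be mapped onto GoozaliRow — the JSON key names are assumed to match the constructor parameter names, as in the mock response in the test below:

# Illustrative only: map the rows of a raw Goozali payload onto GoozaliRow.
# Assumes keys named exactly like the constructor parameters (see the mock
# data in the test below); not confirmed against the real endpoint.
import json

from jobspy.scrapers.goozali.model.GoozaliRow import GoozaliRow


def parse_rows(raw_payload: str) -> list[GoozaliRow]:
    payload = json.loads(raw_payload)
    return [
        GoozaliRow(
            id=row["id"],
            createdTime=row["createdTime"],
            cellValuesByColumnId=row.get("cellValuesByColumnId", {}),
        )
        for row in payload["data"]["table"]["rows"]
    ]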
@@ -0,0 +1,68 @@
from jobspy import scrape_jobs
import pandas as pd

from jobspy.scrapers.goozali.model import GoozaliColumnTypeOptions, GoozaliResponse, GoozaliTable
from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
from jobspy.scrapers.goozali.model.GoozaliRow import GoozaliRow
from jobspy.scrapers.goozali.model.GozaaliResponseData import GoozaliResponseData


def test_goozali():
    # NOTE: currently exercises the glassdoor scraper, not goozali.
    result = scrape_jobs(
        site_name="glassdoor",
        search_term="engineer",
        results_wanted=5,
    )
    assert (
        isinstance(result, pd.DataFrame) and len(result) == 5
    ), "Result should be a non-empty DataFrame"


def createMockGoozaliResponse() -> GoozaliResponse:
    data = GoozaliResponseData(table=GoozaliTable(
        applicationId="app7OQjqEzTtCRq7u",
        id="tblBQjp5Aw6O172VY",
        name="Shared view table",
        columns=[
            GoozaliColumn(
                id="fldIf9DbRpNRLJXuD",
                name="Industry",
                description=None,
                type="multiSelect",
                typeOptions=GoozaliColumnTypeOptions(
                    choiceOrder=["selcE6QUv4vWIIcZR", "sel0JIQKMmz3jCFUN", "selzhpwlfPssG4OEx"],
                    choices={
                        "selwhDNBom2dZJkgv": GoozaliColumnChoice(id="selwhDNBom2dZJkgv", name="HealthTech", color="orange"),
                        "selReHesNOVD3PvCo": GoozaliColumnChoice(id="selReHesNOVD3PvCo", name="Automotive", color="pink"),
                    },
                    disableColors=False,
                ),
                default=None,
                initialCreatedTime="2022-12-29T10:23:21.000Z",
                initialCreatedByUserId="usr1fVy2RIyCuGHec",
                lastModifiedTime="2024-07-21T09:30:02.000Z",
                lastModifiedByUserId="usr1fVy2RIyCuGHec",
                isEditableFromSync=False,
            )
        ],
        primaryColumnId="fldLT11B0cpV6p9Uz",
        meaningfulColumnOrder=[
            {"columnId": "fldLT11B0cpV6p9Uz", "visibility": True},
            {"columnId": "fldIf9DbRpNRLJXuD", "visibility": True, "width": 368},
            {"columnId": "fldOLt34j8Pm2dcCq", "visibility": True, "width": 182},
        ],
        viewOrder=["viwNRSqqmqZLP0a3C"],
        rows=[
            GoozaliRow(
                id="recwiKgHT9mJrqoxa",
                createdTime="2023-01-09T10:32:09.000Z",
                cellValuesByColumnId={
                    "fldLT11B0cpV6p9Uz": ["3M"],
                    "fldIf9DbRpNRLJXuD": ["selwhDNBom2dZJkgv", "selReHesNOVD3PvCo"],
                },
            )
        ],
    ))
    return GoozaliResponse(msg="SUCCESS", data=data)