small changes to help testing

pull/231/head
Yariv Menachem 2024-12-15 18:40:32 +02:00
parent 00f13bdf1a
commit f02e3f7a73
7 changed files with 138 additions and 246 deletions

View File

@ -1,10 +1,15 @@
from __future__ import annotations
from datetime import datetime
from enum import Enum
import pandas as pd
from typing import Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from jobspy.scrapers.site import Site
from .scrapers.goozali import GoozaliScraper
from .jobs import JobPost, JobType, Location
from .scrapers.utils import set_logger_level, extract_salary, create_logger
from .scrapers.indeed import IndeedScraper
@ -12,7 +17,7 @@ from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.glassdoor import GlassdoorScraper
from .scrapers.google import GoogleJobsScraper
from .scrapers.linkedin import LinkedInScraper
from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
from .scrapers import SalarySource, ScraperInput, JobResponse, Country
from .scrapers.exceptions import (
LinkedInException,
IndeedException,
@ -21,6 +26,7 @@ from .scrapers.exceptions import (
GoogleJobsException,
)
def scrape_jobs(
site_name: str | list[str] | Site | list[Site] | None = None,
search_term: str | None = None,
@ -55,6 +61,7 @@ def scrape_jobs(
Site.ZIP_RECRUITER: ZipRecruiterScraper,
Site.GLASSDOOR: GlassdoorScraper,
Site.GOOGLE: GoogleJobsScraper,
Site.GOOZALI: GoozaliScraper,
}
set_logger_level(verbose)
@ -102,7 +109,7 @@ def scrape_jobs(
offset=offset,
hours_old=hours_old,
)
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxies=proxies, ca_cert=ca_cert)
@ -113,12 +120,14 @@ def scrape_jobs(
return site.value, scraped_data
site_to_jobs_dict = {}
merged_jobs:list[JobPost] = []
merged_jobs: list[JobPost] = []
def worker(site):
site_val, scraped_info = scrape_site(site)
# Add the scraped jobs to the merged list
merged_jobs.extend(scraped_info.jobs) # Assuming scraped_info has 'jobs' as a list
# Add the scraped jobs to the merged list
# Assuming scraped_info has 'jobs' as a list
merged_jobs.extend(scraped_info.jobs)
return site_val, scraped_info
with ThreadPoolExecutor() as executor:
@ -129,8 +138,9 @@ def scrape_jobs(
for future in as_completed(future_to_site):
site_value, scraped_data = future.result()
site_to_jobs_dict[site_value] = scraped_data
return merged_jobs
def convert_to_annual(job_data: dict):
if job_data["interval"] == "hourly":
job_data["min_amount"] *= 2080
@ -156,7 +166,8 @@ def scrape_jobs(
job_data["site"] = site
job_data["company"] = job_data["company_name"]
job_data["job_type"] = (
", ".join(job_type.value[0] for job_type in job_data["job_type"])
", ".join(job_type.value[0]
for job_type in job_data["job_type"])
if job_data["job_type"]
else None
)
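With Site.GOOZALI registered in SCRAPER_MAPPING and scrape_jobs now returning the merged list of JobPost objects, a minimal call might look like the sketch below (the search term and result handling are illustrative, not part of this commit):

from jobspy import scrape_jobs

# scrape only the new Goozali source; hours_old limits results to recent postings
jobs = scrape_jobs(
    site_name="goozali",
    search_term="python developer",  # hypothetical query
    hours_old=72,
)
print(f"scraped {len(jobs)} jobs")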

View File

@ -1,19 +1,9 @@
import asyncio
from enum import Enum
from db.job_repository import JobRepository
from jobspy import scrape_jobs
from jobspy import Site, scrape_jobs
from jobspy.db.job_repository import JobRepository
from jobspy.telegram_bot import TelegramBot
class Site(Enum):
LINKEDIN = "linkedin"
GOOZALI = "goozali"
INDEED = "indeed"
ZIP_RECRUITER = "zip_recruiter"
GLASSDOOR = "glassdoor"
GOOGLE = "google"
async def main():
telegramBot = TelegramBot()
jobRepository = JobRepository()

View File

@ -2,7 +2,8 @@ from __future__ import annotations
from abc import ABC, abstractmethod
from jobspy.main import Site
from jobspy.scrapers.site import Site
from ..jobs import (
Enum,

View File

@ -7,43 +7,19 @@ This module contains routines to scrape Goozali.
from __future__ import annotations
import math
import time
import random
import regex as re
from typing import Optional
from datetime import datetime
from jobspy.scrapers import Scraper, ScraperInput
from jobspy.scrapers.site import Site
from bs4.element import Tag
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urlunparse, unquote
from requests.exceptions import RetryError, RequestException
from urllib3.exceptions import MaxRetryError
from .constants import headers
from .. import Scraper, ScraperInput, Site
from ..exceptions import GoozaliException
from ..utils import create_session, remove_attributes, create_logger
from ..utils import create_session, create_logger
from .constants import get_access_policy, headers, cookies, stringifiedObjectParams, request_id, view_ids
from ...jobs import (
JobPost,
Location,
JobResponse,
JobType,
Country,
Compensation,
DescriptionFormat,
)
from ..utils import (
extract_emails_from_text,
get_enum_from_job_type,
currency_parser,
markdown_converter,
)
logger = create_logger("Goozali")
class GoozaliScraper(Scraper):
base_url = "https://www.Goozali.com"
delay = 3
band_delay = 4
jobs_per_page = 25
@ -54,19 +30,26 @@ class GoozaliScraper(Scraper):
"""
Initializes GoozaliScraper with the Goozali job search url
"""
super().__init__(Site.GOOZALI, proxies=proxies, ca_cert=ca_cert)
super().__init__(site=Site.GOOZALI, proxies=proxies, ca_cert=ca_cert)
self.session = create_session(
proxies=self.proxies,
ca_cert=ca_cert,
is_tls=False,
has_retry=True,
delay=5,
clear_cookies=True,
clear_cookies=False,
)
self.session.headers.update(headers)
self.scraper_input = None
self.country = "worldwide"
self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+')
self.base_url = "https://airtable.com/v0.3/view/{view_id}/readSharedViewData"
def _get_params(self, view_id: str) -> dict[str, str]:
access_policy = get_access_policy(view_id)
params = {
"stringifiedObjectParams": stringifiedObjectParams,
"request_id": request_id,
"accessPolicy": access_policy
}
return {k: v for k, v in params.items() if v is not None}
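# Illustrative only: for the shared view "viwIOzPYaUGxlA0Jd" from constants.view_ids,
# the request built by scrape() below becomes roughly
#   GET https://airtable.com/v0.3/view/viwIOzPYaUGxlA0Jd/readSharedViewData
# with stringifiedObjectParams, request_id and the per-view accessPolicy sent as
# query parameters (see the fully encoded URL example in the tests).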
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
@ -77,189 +60,29 @@ class GoozaliScraper(Scraper):
self.scraper_input = scraper_input
job_list: list[JobPost] = []
seen_ids = set()
# create url
# create session -> run the api
# model the response with models
# create map columnId to Column object
# filter result by Field like the web
# filter by date
# map to JobResponse Object
return JobResponse(jobs=job_list)
def _get_job_details(self, job_id: str) -> dict:
"""
Retrieves job description and other job details by going to the job page url
:param job_page_url:
:return: dict
"""
try:
response = self.session.get(
f"{self.base_url}/jobs/view/{job_id}", timeout=5
)
response.raise_for_status()
except:
return {}
if "Goozali.com/signup" in response.url:
return {}
soup = BeautifulSoup(response.text, "html.parser")
div_content = soup.find(
"div", class_=lambda x: x and "show-more-less-html__markup" in x
)
description = None
if div_content is not None:
div_content = remove_attributes(div_content)
description = div_content.prettify(formatter="html")
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description)
h3_tag = soup.find(
"h3", text=lambda text: text and "Job function" in text.strip()
)
job_function = None
if h3_tag:
job_function_span = h3_tag.find_next(
"span", class_="description__job-criteria-text"
)
if job_function_span:
job_function = job_function_span.text.strip()
company_logo = (
logo_image.get("data-delayed-url")
if (logo_image := soup.find("img", {"class": "artdeco-entity-image"}))
else None
)
return {
"description": description,
"job_level": self._parse_job_level(soup),
"company_industry": self._parse_company_industry(soup),
"job_type": self._parse_job_type(soup),
"job_url_direct": self._parse_job_url_direct(soup),
"company_logo": company_logo,
"job_function": job_function,
}
def _get_location(self, metadata_card: Optional[Tag]) -> Location:
"""
Extracts the location data from the job metadata card.
:param metadata_card
:return: location
"""
location = Location(country=Country.from_string(self.country))
if metadata_card is not None:
location_tag = metadata_card.find(
"span", class_="job-search-card__location"
)
location_string = location_tag.text.strip() if location_tag else "N/A"
parts = location_string.split(", ")
if len(parts) == 2:
city, state = parts
location = Location(
city=city,
state=state,
country=Country.from_string(self.country),
for view_id in view_ids:
# create url
url = self.base_url.format(view_id=view_id)
params = self._get_params(view_id)
# create session -> run the api
try:
response = self.session.get(
url=url,
params=params,
timeout=10,
headers=headers,
cookies=cookies
)
elif len(parts) == 3:
city, state, country = parts
country = Country.from_string(country)
location = Location(city=city, state=state, country=country)
return location
@staticmethod
def _parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None:
"""
Gets the job type from job page
:param soup_job_type:
:return: JobType
"""
h3_tag = soup_job_type.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Employment type" in text,
)
employment_type = None
if h3_tag:
employment_type_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if employment_type_span:
employment_type = employment_type_span.get_text(strip=True)
employment_type = employment_type.lower()
employment_type = employment_type.replace("-", "")
return [get_enum_from_job_type(employment_type)] if employment_type else []
@staticmethod
def _parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
"""
Gets the job level from job page
:param soup_job_level:
:return: str
"""
h3_tag = soup_job_level.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Seniority level" in text,
)
job_level = None
if h3_tag:
job_level_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if job_level_span:
job_level = job_level_span.get_text(strip=True)
return job_level
@staticmethod
def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
"""
Gets the company industry from job page
:param soup_industry:
:return: str
"""
h3_tag = soup_industry.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Industries" in text,
)
industry = None
if h3_tag:
industry_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if industry_span:
industry = industry_span.get_text(strip=True)
return industry
def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
"""
Gets the job url direct from job page
:param soup:
:return: str
"""
job_url_direct = None
job_url_direct_content = soup.find("code", id="applyUrl")
if job_url_direct_content:
job_url_direct_match = self.job_url_direct_regex.search(
job_url_direct_content.decode_contents().strip()
)
if job_url_direct_match:
job_url_direct = unquote(job_url_direct_match.group())
return job_url_direct
@staticmethod
def job_type_code(job_type_enum: JobType) -> str:
return {
JobType.FULL_TIME: "F",
JobType.PART_TIME: "P",
JobType.INTERNSHIP: "I",
JobType.CONTRACT: "C",
JobType.TEMPORARY: "T",
}.get(job_type_enum, "")
logger.info(f"response: {str(response)}")
if (response.status_code != 200):
logger.error(f"Status code: {
response.status_code}, Error: {str(response.text)}")
return JobResponse(jobs=job_list)
except Exception as e:
logger.error(f"Exception: {str(e)}")
# model the response with models
# create map columnId to Column object
# filter result by Field like the web
# filter by date
# map to JobResponse Object
return JobResponse(jobs=job_list)
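The commented steps above are still stubs. A rough sketch of the intended flow, assuming the GoozaliResponseData / GoozaliRow / GoozaliColumn models referenced by the tests expose the fields used here (their exact attributes, and the two helper methods, are assumptions for illustration):

# parse the shared-view payload into the response model (payload shape assumed)
data = GoozaliResponseData(**response.json()["data"])
# map columnId -> column so row cells can be read by field name
column_by_id = {column.id: column for column in data.columns}
# keep only rows recent enough for scraper_input.hours_old (hypothetical helper)
recent_rows = [row for row in data.rows if self._is_recent(row, scraper_input.hours_old)]
# convert each remaining row into a JobPost (hypothetical helper) and collect
job_list.extend(self._map_row_to_job_post(row, column_by_id) for row in recent_rows)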

View File

@ -1,8 +1,62 @@
view_ids = ["viwIOzPYaUGxlA0Jd"]
headers = {
"authority": "www.linkedin.com",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-language": "en-US,en;q=0.9",
"cache-control": "max-age=0",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"accept": "*/*",
"accept-language": "en-US,en;q=0.9,he-IL;q=0.8,he;q=0.7",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
"x-airtable-accept-msgpack": "true",
"x-airtable-application-id": "appwewqLk7iUY4azc",
"x-requested-with": "XMLHttpRequest"
}
session_id = "lWt/xRLIQas/blkys/2YBYl0priNI7gv85sXXtmkrW+TzbLHR8Vm6iY5RDialmLUYsQgLab8uWZyahWRw0HizxdOXhJxd5FB66H85GpUAX8zZbAZPZdUHvzxjaVa130w14QSXDa8OmsNlpKtiUtZ/DXMTOZ1wYDWC4tVJTKJ171wyKA7C9E="
cookies = {
"__Host-airtable-session": "eyJzZXNzaW9uSWQiOiJzZXNxdFV4bVdKRVRoVGtRMiIsImNzcmZTZWNyZXQiOiIyT0JrVTJkU2I4bDA3NFZIRmd6eTdjTHUifQ==",
"__Host-airtable-session.sig": "heWRrVH73Aa-2ALrH4c_CbvQqTNbNRv9VjPZYv3aHJ4",
"brw": "brwtN7N3OgPFrtfb2",
"brwConsent": "opt-in",
"acq": "eyJhY3F1aXNpdGlvbiI6Ilt7XCJwbGF0Zm9ybVwiOlwiZGVza3RvcFwiLFwib3JpZ2luXCI6XCJsb2dpblwiLFwidG91Y2hUaW1lXCI6XCIyMDI0LTEyLTEyVDE3OjU1OjQyLjU3OVpcIn1dIn0=",
"acq.sig": "5xrqXjip4IJZxIeSPCkajWt_wlBmGw-k7HJCj8wicxU",
"AWSALBTGCORS": "YoIaU+wibkMfutpYUIlGnvYmnUa0VjM2ukwIhESaxfQUNL+PkCcRm5MIXVI5Q+dNJn7rAfdvTlrSF8XXU7wIWQqg8DQn2+OmvFeR5uzreWH5QaRIodTZ5gVQpXK1A62oDSR18fgyIOBRza2wIiet/67JgimPxGpuecdbz2oUwr7UqifGVz0="
}
request_id = "req4q4tKw3woEEWxw&"
share_id = "shrQBuWjXd0YgPqV6"
application_id = "appwewqLk7iUY4azc"
signature = "be8bd40c133f051f929ebab311c416013f5af0d5acae4264575b88ccf051ee59"
def get_access_policy(view_id: str) -> dict[str, str]:
return {
"allowedActions": [
{
"modelClassName": "view",
"modelIdSelector": view_id,
"action": "readSharedViewData"
},
{
"modelClassName": "view",
"modelIdSelector": view_id,
"action": "getMetadataForPrinting"
},
{
"modelClassName": "view",
"modelIdSelector": view_id,
"action": "readSignedAttachmentUrls"
},
{
"modelClassName": "row",
"modelIdSelector": f"rows *[displayedInView={view_id}]",
"action": "createDocumentPreviewSession"
}
],
"shareId": share_id,
"applicationId": application_id,
"generationNumber": 0,
# "expires": "2025-01-02T00:00:00.000Z", # todo:: check how to set it
"signature": signature
}
stringifiedObjectParams = {"shouldUseNestedResponseFormat": "true"}
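Airtable expects stringifiedObjectParams and accessPolicy as JSON strings in the query (see the encoded URL example in the tests). A minimal sketch of producing that encoding from these constants, purely for reference:

import json
from urllib.parse import urlencode

view_id = view_ids[0]
query = urlencode({
    "stringifiedObjectParams": json.dumps({"shouldUseNestedResponseFormat": True}),
    "requestId": request_id,
    "accessPolicy": json.dumps(get_access_policy(view_id)),
})
url = f"https://airtable.com/v0.3/view/{view_id}/readSharedViewData?{query}"

Note that the encoded example in the tests uses the query key requestId, while the constant above is named request_id.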

View File

@ -0,0 +1,10 @@
from enum import Enum
class Site(Enum):
LINKEDIN = "linkedin"
INDEED = "indeed"
ZIP_RECRUITER = "zip_recruiter"
GLASSDOOR = "glassdoor"
GOOGLE = "google"
GOOZALI = "goozali"
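With Site defined in its own module and re-exported from the package, callers can import it directly, as main.py above now does; an illustrative call:

from jobspy import Site, scrape_jobs

jobs = scrape_jobs(site_name=[Site.GOOZALI, Site.LINKEDIN])  # illustrative site list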

View File

@ -7,6 +7,9 @@ from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoic
from jobspy.scrapers.goozali.model.GoozaliRow import GoozaliRow
from jobspy.scrapers.goozali.model.GozaaliResponseData import GoozaliResponseData
# URL Example
# https://airtable.com/v0.3/view/viwagEIbkfz2iMsLU/readSharedViewData?stringifiedObjectParams=%7B%22shouldUseNestedResponseFormat%22%3Atrue%7D&requestId=reqXyRSHWlXyiRgY9&accessPolicy=%7B%22allowedActions%22%3A%5B%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSharedViewData%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22getMetadataForPrinting%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSignedAttachmentUrls%22%7D%2C%7B%22modelClassName%22%3A%22row%22%2C%22modelIdSelector%22%3A%22rows%20*%5BdisplayedInView%3DviwagEIbkfz2iMsLU%5D%22%2C%22action%22%3A%22createDocumentPreviewSession%22%7D%5D%2C%22shareId%22%3A%22shr97tl6luEk4Ca9R%22%2C%22applicationId%22%3A%22app5sYJyDgcRbJWYU%22%2C%22generationNumber%22%3A0%2C%22expires%22%3A%222025-01-02T00%3A00%3A00.000Z%22%2C%22signature%22%3A%223aa292ee44d15aa75d9506200329e413653471f89e000fa370ef9fa38393070a%22%7D
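# Decoded for readability, the query parameters in the example above are:
#   stringifiedObjectParams = {"shouldUseNestedResponseFormat": true}
#   requestId = reqXyRSHWlXyiRgY9
#   accessPolicy = {
#     "allowedActions": [readSharedViewData, getMetadataForPrinting,
#                        readSignedAttachmentUrls, createDocumentPreviewSession
#                        for view viwagEIbkfz2iMsLU],
#     "shareId": "shr97tl6luEk4Ca9R", "applicationId": "app5sYJyDgcRbJWYU",
#     "generationNumber": 0, "expires": "2025-01-02T00:00:00.000Z",
#     "signature": "3aa292ee44d15aa75d9506200329e413653471f89e000fa370ef9fa38393070a"
#   }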
def test_goozali():
result = scrape_jobs(