diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py
index c0a6a90..c4645d1 100644
--- a/src/jobspy/__init__.py
+++ b/src/jobspy/__init__.py
@@ -1,10 +1,15 @@
 from __future__ import annotations

 from datetime import datetime
+from enum import Enum
 import pandas as pd
 from typing import Tuple
 from concurrent.futures import ThreadPoolExecutor, as_completed

+from jobspy.scrapers.site import Site
+
+from .scrapers.goozali import GoozaliScraper
+
 from .jobs import JobPost, JobType, Location
 from .scrapers.utils import set_logger_level, extract_salary, create_logger
 from .scrapers.indeed import IndeedScraper
@@ -12,7 +17,7 @@ from .scrapers.ziprecruiter import ZipRecruiterScraper
 from .scrapers.glassdoor import GlassdoorScraper
 from .scrapers.google import GoogleJobsScraper
 from .scrapers.linkedin import LinkedInScraper
-from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
+from .scrapers import SalarySource, ScraperInput, JobResponse, Country
 from .scrapers.exceptions import (
     LinkedInException,
     IndeedException,
@@ -21,6 +26,7 @@ from .scrapers.exceptions import (
     GoogleJobsException,
 )

+
 def scrape_jobs(
     site_name: str | list[str] | Site | list[Site] | None = None,
     search_term: str | None = None,
@@ -55,6 +61,7 @@ def scrape_jobs(
         Site.ZIP_RECRUITER: ZipRecruiterScraper,
         Site.GLASSDOOR: GlassdoorScraper,
         Site.GOOGLE: GoogleJobsScraper,
+        Site.GOOZALI: GoozaliScraper,
     }
     set_logger_level(verbose)

@@ -102,7 +109,7 @@ def scrape_jobs(
         offset=offset,
         hours_old=hours_old,
     )
-
+
     def scrape_site(site: Site) -> Tuple[str, JobResponse]:
         scraper_class = SCRAPER_MAPPING[site]
         scraper = scraper_class(proxies=proxies, ca_cert=ca_cert)
@@ -113,12 +120,14 @@ def scrape_jobs(
         return site.value, scraped_data

     site_to_jobs_dict = {}
-    merged_jobs:list[JobPost] = []
+    merged_jobs: list[JobPost] = []
+
     def worker(site):
         site_val, scraped_info = scrape_site(site)
-        # Add the scraped jobs to the merged list
-        merged_jobs.extend(scraped_info.jobs)  # Assuming scraped_info has 'jobs' as a list
-
+        # Add the scraped jobs to the merged list
+        # Assuming scraped_info has 'jobs' as a list
+        merged_jobs.extend(scraped_info.jobs)
+
         return site_val, scraped_info

     with ThreadPoolExecutor() as executor:
@@ -129,8 +138,9 @@ def scrape_jobs(
         for future in as_completed(future_to_site):
             site_value, scraped_data = future.result()
             site_to_jobs_dict[site_value] = scraped_data
-
+
         return merged_jobs
+
     def convert_to_annual(job_data: dict):
         if job_data["interval"] == "hourly":
             job_data["min_amount"] *= 2080
@@ -156,7 +166,8 @@ def scrape_jobs(
             job_data["site"] = site
             job_data["company"] = job_data["company_name"]
             job_data["job_type"] = (
-                ", ".join(job_type.value[0] for job_type in job_data["job_type"])
+                ", ".join(job_type.value[0]
+                          for job_type in job_data["job_type"])
                 if job_data["job_type"]
                 else None
             )
diff --git a/src/jobspy/main.py b/src/jobspy/main.py
index b7c9196..7966b1e 100644
--- a/src/jobspy/main.py
+++ b/src/jobspy/main.py
@@ -1,19 +1,9 @@
 import asyncio
-from enum import Enum
-from db.job_repository import JobRepository
-from jobspy import scrape_jobs
+from jobspy import Site, scrape_jobs
+from jobspy.db.job_repository import JobRepository
 from jobspy.telegram_bot import TelegramBot

-class Site(Enum):
-    LINKEDIN = "linkedin"
-    GOOZALI = "goozali"
-    INDEED = "indeed"
-    ZIP_RECRUITER = "zip_recruiter"
-    GLASSDOOR = "glassdoor"
-    GOOGLE = "google"
-
-
 async def main():
     telegramBot = TelegramBot()
     jobRepository = JobRepository()
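For reference, the reworked scrape_jobs above fans each requested site out to its own worker thread via ThreadPoolExecutor and folds every scraper's JobPost list into a single merged_jobs list, which it now returns directly. A minimal, self-contained sketch of that fan-out/merge pattern, using a stand-in job type rather than the real scrapers:

    from concurrent.futures import ThreadPoolExecutor, as_completed
    from dataclasses import dataclass


    @dataclass
    class FakeJob:
        site: str
        title: str


    def fetch_jobs(site: str) -> list[FakeJob]:
        # Stand-in for scraper.scrape(scraper_input).jobs
        return [FakeJob(site=site, title=f"{site} job {i}") for i in range(3)]


    def scrape_all(sites: list[str]) -> list[FakeJob]:
        merged: list[FakeJob] = []
        with ThreadPoolExecutor() as executor:
            future_to_site = {executor.submit(fetch_jobs, s): s for s in sites}
            for future in as_completed(future_to_site):
                # Futures finish in arbitrary order; collect each result as it completes.
                merged.extend(future.result())
        return merged


    print(len(scrape_all(["linkedin", "goozali", "indeed"])))  # -> 9

Extending a plain list from several worker threads, as the worker closure in the patch does, is generally safe under CPython's GIL, but collecting results from future.result() as above keeps the data flow easier to follow.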
diff --git a/src/jobspy/scrapers/__init__.py b/src/jobspy/scrapers/__init__.py
index 8f271e0..861d269 100644
--- a/src/jobspy/scrapers/__init__.py
+++ b/src/jobspy/scrapers/__init__.py
@@ -2,7 +2,8 @@ from __future__ import annotations

 from abc import ABC, abstractmethod

-from jobspy.main import Site
+from jobspy.scrapers.site import Site
+
 from ..jobs import (
     Enum,
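The one-line import swap above is the structural point of this file's change: Site now lives in the leaf module jobspy/scrapers/site.py (added later in this diff) instead of jobspy/main.py, so the scraper base package no longer imports the application entry point. A small sketch of the dependency direction this enforces; it mirrors the new module rather than quoting it, and the helper below is illustrative only:

    from enum import Enum


    # Leaf module (cf. jobspy/scrapers/site.py): it imports nothing from jobspy,
    # so jobspy/__init__.py, jobspy/scrapers/__init__.py and jobspy/main.py can
    # all import Site without re-creating the old main <-> scrapers import cycle.
    class Site(Enum):
        LINKEDIN = "linkedin"
        GOOZALI = "goozali"


    def to_sites(names: list[str]) -> list[Site]:
        # Typical consumer pattern: normalize user-supplied strings to the enum.
        return [Site(name.lower()) for name in names]


    print(to_sites(["LinkedIn", "goozali"]))  # [Site.LINKEDIN, Site.GOOZALI]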
"Goozali.com/signup" in response.url: - return {} - - soup = BeautifulSoup(response.text, "html.parser") - div_content = soup.find( - "div", class_=lambda x: x and "show-more-less-html__markup" in x - ) - description = None - if div_content is not None: - div_content = remove_attributes(div_content) - description = div_content.prettify(formatter="html") - if self.scraper_input.description_format == DescriptionFormat.MARKDOWN: - description = markdown_converter(description) - - h3_tag = soup.find( - "h3", text=lambda text: text and "Job function" in text.strip() - ) - - job_function = None - if h3_tag: - job_function_span = h3_tag.find_next( - "span", class_="description__job-criteria-text" - ) - if job_function_span: - job_function = job_function_span.text.strip() - - company_logo = ( - logo_image.get("data-delayed-url") - if (logo_image := soup.find("img", {"class": "artdeco-entity-image"})) - else None - ) - return { - "description": description, - "job_level": self._parse_job_level(soup), - "company_industry": self._parse_company_industry(soup), - "job_type": self._parse_job_type(soup), - "job_url_direct": self._parse_job_url_direct(soup), - "company_logo": company_logo, - "job_function": job_function, - } - - def _get_location(self, metadata_card: Optional[Tag]) -> Location: - """ - Extracts the location data from the job metadata card. - :param metadata_card - :return: location - """ - location = Location(country=Country.from_string(self.country)) - if metadata_card is not None: - location_tag = metadata_card.find( - "span", class_="job-search-card__location" - ) - location_string = location_tag.text.strip() if location_tag else "N/A" - parts = location_string.split(", ") - if len(parts) == 2: - city, state = parts - location = Location( - city=city, - state=state, - country=Country.from_string(self.country), + for view_id in view_ids: + # create url + url = self.base_url.format(view_id=view_id) + params = self._get_params(view_id) + # create session -> run the api + try: + response = self.session.get( + url=url, + params=params, + timeout=10, + headers=headers, + cookies=cookies ) - elif len(parts) == 3: - city, state, country = parts - country = Country.from_string(country) - location = Location(city=city, state=state, country=country) - return location - - @staticmethod - def _parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None: - """ - Gets the job type from job page - :param soup_job_type: - :return: JobType - """ - h3_tag = soup_job_type.find( - "h3", - class_="description__job-criteria-subheader", - string=lambda text: "Employment type" in text, - ) - employment_type = None - if h3_tag: - employment_type_span = h3_tag.find_next_sibling( - "span", - class_="description__job-criteria-text description__job-criteria-text--criteria", - ) - if employment_type_span: - employment_type = employment_type_span.get_text(strip=True) - employment_type = employment_type.lower() - employment_type = employment_type.replace("-", "") - - return [get_enum_from_job_type(employment_type)] if employment_type else [] - - @staticmethod - def _parse_job_level(soup_job_level: BeautifulSoup) -> str | None: - """ - Gets the job level from job page - :param soup_job_level: - :return: str - """ - h3_tag = soup_job_level.find( - "h3", - class_="description__job-criteria-subheader", - string=lambda text: "Seniority level" in text, - ) - job_level = None - if h3_tag: - job_level_span = h3_tag.find_next_sibling( - "span", - class_="description__job-criteria-text 
description__job-criteria-text--criteria", - ) - if job_level_span: - job_level = job_level_span.get_text(strip=True) - - return job_level - - @staticmethod - def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None: - """ - Gets the company industry from job page - :param soup_industry: - :return: str - """ - h3_tag = soup_industry.find( - "h3", - class_="description__job-criteria-subheader", - string=lambda text: "Industries" in text, - ) - industry = None - if h3_tag: - industry_span = h3_tag.find_next_sibling( - "span", - class_="description__job-criteria-text description__job-criteria-text--criteria", - ) - if industry_span: - industry = industry_span.get_text(strip=True) - - return industry - - def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None: - """ - Gets the job url direct from job page - :param soup: - :return: str - """ - job_url_direct = None - job_url_direct_content = soup.find("code", id="applyUrl") - if job_url_direct_content: - job_url_direct_match = self.job_url_direct_regex.search( - job_url_direct_content.decode_contents().strip() - ) - if job_url_direct_match: - job_url_direct = unquote(job_url_direct_match.group()) - - return job_url_direct - - @staticmethod - def job_type_code(job_type_enum: JobType) -> str: - return { - JobType.FULL_TIME: "F", - JobType.PART_TIME: "P", - JobType.INTERNSHIP: "I", - JobType.CONTRACT: "C", - JobType.TEMPORARY: "T", - }.get(job_type_enum, "") + logger.info(f"response: {str(response)}") + if (response.status_code != 200): + logger.error(f"Status code: { + response.status_code}, Error: {str(response.text)}") + return JobResponse(jobs=job_list) + except Exception as e: + logger.error(f"Exception: {str(e)}") + # model the response with models + # create map columnId to Column object + # filter result by Field like the web + # filter by date + # map to JobResponse Object + return JobResponse(jobs=job_list) diff --git a/src/jobspy/scrapers/goozali/constants.py b/src/jobspy/scrapers/goozali/constants.py index 6123058..4fe3af6 100644 --- a/src/jobspy/scrapers/goozali/constants.py +++ b/src/jobspy/scrapers/goozali/constants.py @@ -1,8 +1,62 @@ +view_ids = ["viwIOzPYaUGxlA0Jd"] + headers = { - "authority": "www.linkedin.com", - "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", - "accept-language": "en-US,en;q=0.9", - "cache-control": "max-age=0", - "upgrade-insecure-requests": "1", - "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "accept": "*/*", + "accept-language": "en-US,en;q=0.9,he-IL;q=0.8,he;q=0.7", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + "x-airtable-accept-msgpack": "true", + "x-airtable-application-id": "appwewqLk7iUY4azc", + "x-requested-with": "XMLHttpRequest" } + +session_id = "lWt/xRLIQas/blkys/2YBYl0priNI7gv85sXXtmkrW+TzbLHR8Vm6iY5RDialmLUYsQgLab8uWZyahWRw0HizxdOXhJxd5FB66H85GpUAX8zZbAZPZdUHvzxjaVa130w14QSXDa8OmsNlpKtiUtZ/DXMTOZ1wYDWC4tVJTKJ171wyKA7C9E=" + +cookies = { + "__Host-airtable-session": "eyJzZXNzaW9uSWQiOiJzZXNxdFV4bVdKRVRoVGtRMiIsImNzcmZTZWNyZXQiOiIyT0JrVTJkU2I4bDA3NFZIRmd6eTdjTHUifQ==", + "__Host-airtable-session.sig": "heWRrVH73Aa-2ALrH4c_CbvQqTNbNRv9VjPZYv3aHJ4", + "brw": "brwtN7N3OgPFrtfb2", + "brwConsent": "opt-in", + "acq": 
"eyJhY3F1aXNpdGlvbiI6Ilt7XCJwbGF0Zm9ybVwiOlwiZGVza3RvcFwiLFwib3JpZ2luXCI6XCJsb2dpblwiLFwidG91Y2hUaW1lXCI6XCIyMDI0LTEyLTEyVDE3OjU1OjQyLjU3OVpcIn1dIn0=", + "acq.sig": "5xrqXjip4IJZxIeSPCkajWt_wlBmGw-k7HJCj8wicxU", + "AWSALBTGCORS": "YoIaU+wibkMfutpYUIlGnvYmnUa0VjM2ukwIhESaxfQUNL+PkCcRm5MIXVI5Q+dNJn7rAfdvTlrSF8XXU7wIWQqg8DQn2+OmvFeR5uzreWH5QaRIodTZ5gVQpXK1A62oDSR18fgyIOBRza2wIiet/67JgimPxGpuecdbz2oUwr7UqifGVz0=" +} + +request_id = "req4q4tKw3woEEWxw&" +share_id = "shrQBuWjXd0YgPqV6" +application_id = "appwewqLk7iUY4azc" +signature = "be8bd40c133f051f929ebab311c416013f5af0d5acae4264575b88ccf051ee59" + + +def get_access_policy(view_id: str) -> dict[str, str]: + return { + "allowedActions": [ + { + "modelClassName": "view", + "modelIdSelector": view_id, + "action": "readSharedViewData" + }, + { + "modelClassName": "view", + "modelIdSelector": view_id, + "action": "getMetadataForPrinting" + }, + { + "modelClassName": "view", + "modelIdSelector": view_id, + "action": "readSignedAttachmentUrls" + }, + { + "modelClassName": "row", + "modelIdSelector": f"rows *[displayedInView={view_id}]", + "action": "createDocumentPreviewSession" + } + ], + "shareId": share_id, + "applicationId": application_id, + "generationNumber": 0, + # "expires": "2025-01-02T00:00:00.000Z", # todo:: check how to set it + "signature": signature + } + + +stringifiedObjectParams = {"shouldUseNestedResponseFormat": "true"} diff --git a/src/jobspy/scrapers/site.py b/src/jobspy/scrapers/site.py new file mode 100644 index 0000000..1ad5aeb --- /dev/null +++ b/src/jobspy/scrapers/site.py @@ -0,0 +1,10 @@ +from enum import Enum + + +class Site(Enum): + LINKEDIN = "linkedin" + INDEED = "indeed" + ZIP_RECRUITER = "zip_recruiter" + GLASSDOOR = "glassdoor" + GOOGLE = "google" + GOOZALI = "goozali" diff --git a/src/tests/test_goozali.py b/src/tests/test_goozali.py index 79edd7c..027e421 100644 --- a/src/tests/test_goozali.py +++ b/src/tests/test_goozali.py @@ -7,6 +7,9 @@ from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoic from jobspy.scrapers.goozali.model.GoozaliRow import GoozaliRow from jobspy.scrapers.goozali.model.GozaaliResponseData import GoozaliResponseData +# URL Example +# https://airtable.com/v0.3/view/viwagEIbkfz2iMsLU/readSharedViewData?stringifiedObjectParams=%7B%22shouldUseNestedResponseFormat%22%3Atrue%7D&requestId=reqXyRSHWlXyiRgY9&accessPolicy=%7B%22allowedActions%22%3A%5B%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSharedViewData%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22getMetadataForPrinting%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSignedAttachmentUrls%22%7D%2C%7B%22modelClassName%22%3A%22row%22%2C%22modelIdSelector%22%3A%22rows%20*%5BdisplayedInView%3DviwagEIbkfz2iMsLU%5D%22%2C%22action%22%3A%22createDocumentPreviewSession%22%7D%5D%2C%22shareId%22%3A%22shr97tl6luEk4Ca9R%22%2C%22applicationId%22%3A%22app5sYJyDgcRbJWYU%22%2C%22generationNumber%22%3A0%2C%22expires%22%3A%222025-01-02T00%3A00%3A00.000Z%22%2C%22signature%22%3A%223aa292ee44d15aa75d9506200329e413653471f89e000fa370ef9fa38393070a%22%7D + def test_goozali(): result = scrape_jobs(