mirror of https://github.com/Bunsly/JobSpy
small changes to help testing
parent 00f13bdf1a
commit f02e3f7a73
@@ -1,10 +1,15 @@
 from __future__ import annotations
 from datetime import datetime
+from enum import Enum

 import pandas as pd
 from typing import Tuple
 from concurrent.futures import ThreadPoolExecutor, as_completed

+from jobspy.scrapers.site import Site
+
+from .scrapers.goozali import GoozaliScraper
+
 from .jobs import JobPost, JobType, Location
 from .scrapers.utils import set_logger_level, extract_salary, create_logger
 from .scrapers.indeed import IndeedScraper

@@ -12,7 +17,7 @@ from .scrapers.ziprecruiter import ZipRecruiterScraper
 from .scrapers.glassdoor import GlassdoorScraper
 from .scrapers.google import GoogleJobsScraper
 from .scrapers.linkedin import LinkedInScraper
-from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
+from .scrapers import SalarySource, ScraperInput, JobResponse, Country
 from .scrapers.exceptions import (
     LinkedInException,
     IndeedException,

@@ -21,6 +26,7 @@ from .scrapers.exceptions import (
     GoogleJobsException,
 )


 def scrape_jobs(
     site_name: str | list[str] | Site | list[Site] | None = None,
     search_term: str | None = None,

@@ -55,6 +61,7 @@ def scrape_jobs(
         Site.ZIP_RECRUITER: ZipRecruiterScraper,
         Site.GLASSDOOR: GlassdoorScraper,
         Site.GOOGLE: GoogleJobsScraper,
+        Site.GOOZALI: GoozaliScraper,
     }
     set_logger_level(verbose)

@@ -114,10 +121,12 @@ def scrape_jobs(

     site_to_jobs_dict = {}
     merged_jobs: list[JobPost] = []

     def worker(site):
         site_val, scraped_info = scrape_site(site)
         # Add the scraped jobs to the merged list
-        merged_jobs.extend(scraped_info.jobs)  # Assuming scraped_info has 'jobs' as a list
+        # Assuming scraped_info has 'jobs' as a list
+        merged_jobs.extend(scraped_info.jobs)

         return site_val, scraped_info

@@ -131,6 +140,7 @@ def scrape_jobs(
             site_to_jobs_dict[site_value] = scraped_data

     return merged_jobs

     def convert_to_annual(job_data: dict):
         if job_data["interval"] == "hourly":
             job_data["min_amount"] *= 2080

@@ -156,7 +166,8 @@ def scrape_jobs(
             job_data["site"] = site
             job_data["company"] = job_data["company_name"]
             job_data["job_type"] = (
-                ", ".join(job_type.value[0] for job_type in job_data["job_type"])
+                ", ".join(job_type.value[0]
+                          for job_type in job_data["job_type"])
                 if job_data["job_type"]
                 else None
             )
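
For reference, a minimal usage sketch of the updated entry point (an illustration, not part of the commit). It assumes the keyword arguments keep their upstream JobSpy names and that scrape_jobs now returns the merged list[JobPost] built above rather than a DataFrame:

    # sketch: string site names resolve to the Site enum, so "goozali" maps to the new scraper
    from jobspy import scrape_jobs

    jobs = scrape_jobs(
        site_name=["goozali"],          # or [Site.GOOZALI]
        search_term="python developer",
        results_wanted=20,              # assumed upstream parameter
    )
    print(f"scraped {len(jobs)} JobPost objects")
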
@@ -1,19 +1,9 @@
 import asyncio
-from enum import Enum
-from db.job_repository import JobRepository
-from jobspy import scrape_jobs
+from jobspy import Site, scrape_jobs
+from jobspy.db.job_repository import JobRepository
 from jobspy.telegram_bot import TelegramBot


-class Site(Enum):
-    LINKEDIN = "linkedin"
-    GOOZALI = "goozali"
-    INDEED = "indeed"
-    ZIP_RECRUITER = "zip_recruiter"
-    GLASSDOOR = "glassdoor"
-    GOOGLE = "google"


 async def main():
     telegramBot = TelegramBot()
     jobRepository = JobRepository()
@@ -2,7 +2,8 @@ from __future__ import annotations

 from abc import ABC, abstractmethod

-from jobspy.main import Site
+from jobspy.scrapers.site import Site


 from ..jobs import (
     Enum,
@@ -7,43 +7,19 @@ This module contains routines to scrape Goozali.

 from __future__ import annotations

-import math
-import time
-import random
-import regex as re
-from typing import Optional
-from datetime import datetime
+from jobspy.scrapers import Scraper, ScraperInput
+from jobspy.scrapers.site import Site

-from bs4.element import Tag
-from bs4 import BeautifulSoup
-from urllib.parse import urlparse, urlunparse, unquote
-from requests.exceptions import RetryError, RequestException
-from urllib3.exceptions import MaxRetryError
-from .constants import headers
-from .. import Scraper, ScraperInput, Site
-from ..exceptions import GoozaliException
-from ..utils import create_session, remove_attributes, create_logger
+from ..utils import create_session, create_logger
+from .constants import get_access_policy, headers, cookies, stringifiedObjectParams, request_id, view_ids
 from ...jobs import (
     JobPost,
-    Location,
     JobResponse,
-    JobType,
-    Country,
-    Compensation,
-    DescriptionFormat,
 )
-from ..utils import (
-    extract_emails_from_text,
-    get_enum_from_job_type,
-    currency_parser,
-    markdown_converter,
-)

 logger = create_logger("Goozali")


 class GoozaliScraper(Scraper):
-    base_url = "https://www.Goozali.com"
     delay = 3
     band_delay = 4
     jobs_per_page = 25
@@ -54,19 +30,26 @@ class GoozaliScraper(Scraper):
         """
         Initializes GoozaliScraper with the Goozalijob search url
         """
-        super().__init__(Site.GOOZALI, proxies=proxies, ca_cert=ca_cert)
+        super().__init__(site=Site.GOOZALI, proxies=proxies, ca_cert=ca_cert)
         self.session = create_session(
             proxies=self.proxies,
             ca_cert=ca_cert,
             is_tls=False,
             has_retry=True,
             delay=5,
-            clear_cookies=True,
+            clear_cookies=False,
         )
-        self.session.headers.update(headers)
-        self.scraper_input = None
-        self.country = "worldwide"
-        self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+')
+        self.base_url = "https://airtable.com/v0.3/view/{view_id}/readSharedViewData"
+
+    def _get_params(self, view_id: str) -> dict[str, str]:
+        access_policy = get_access_policy(view_id)
+        params = {
+            "stringifiedObjectParams": stringifiedObjectParams,
+            "request_id": request_id,
+            "accessPolicy": access_policy
+        }

+        return {k: v for k, v in params.items() if v is not None}

     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
@@ -77,189 +60,29 @@ class GoozaliScraper(Scraper):
         self.scraper_input = scraper_input
         job_list: list[JobPost] = []
         seen_ids = set()
+        for view_id in view_ids:
             # create url
+            url = self.base_url.format(view_id=view_id)
+            params = self._get_params(view_id)
             # create session -> run the api
+            try:
+                response = self.session.get(
+                    url=url,
+                    params=params,
+                    timeout=10,
+                    headers=headers,
+                    cookies=cookies
+                )
+                logger.info(f"response: {str(response)}")
+                if (response.status_code != 200):
+                    logger.error(f"Status code: {
+                        response.status_code}, Error: {str(response.text)}")
+                    return JobResponse(jobs=job_list)
+            except Exception as e:
+                logger.error(f"Exception: {str(e)}")
         # model the response with models
         # create map columnId to Column object
         # filter result by Field like the web
         # filter by date
         # map to JobResponse Object
         return JobResponse(jobs=job_list)

-    def _get_job_details(self, job_id: str) -> dict:
-        """
-        Retrieves job description and other job details by going to the job page url
-        :param job_page_url:
-        :return: dict
-        """
-        try:
-            response = self.session.get(
-                f"{self.base_url}/jobs/view/{job_id}", timeout=5
-            )
-            response.raise_for_status()
-        except:
-            return {}
-        if "Goozali.com/signup" in response.url:
-            return {}
-
-        soup = BeautifulSoup(response.text, "html.parser")
-        div_content = soup.find(
-            "div", class_=lambda x: x and "show-more-less-html__markup" in x
-        )
-        description = None
-        if div_content is not None:
-            div_content = remove_attributes(div_content)
-            description = div_content.prettify(formatter="html")
-            if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
-                description = markdown_converter(description)
-
-        h3_tag = soup.find(
-            "h3", text=lambda text: text and "Job function" in text.strip()
-        )
-
-        job_function = None
-        if h3_tag:
-            job_function_span = h3_tag.find_next(
-                "span", class_="description__job-criteria-text"
-            )
-            if job_function_span:
-                job_function = job_function_span.text.strip()
-
-        company_logo = (
-            logo_image.get("data-delayed-url")
-            if (logo_image := soup.find("img", {"class": "artdeco-entity-image"}))
-            else None
-        )
-        return {
-            "description": description,
-            "job_level": self._parse_job_level(soup),
-            "company_industry": self._parse_company_industry(soup),
-            "job_type": self._parse_job_type(soup),
-            "job_url_direct": self._parse_job_url_direct(soup),
-            "company_logo": company_logo,
-            "job_function": job_function,
-        }
-
-    def _get_location(self, metadata_card: Optional[Tag]) -> Location:
-        """
-        Extracts the location data from the job metadata card.
-        :param metadata_card
-        :return: location
-        """
-        location = Location(country=Country.from_string(self.country))
-        if metadata_card is not None:
-            location_tag = metadata_card.find(
-                "span", class_="job-search-card__location"
-            )
-            location_string = location_tag.text.strip() if location_tag else "N/A"
-            parts = location_string.split(", ")
-            if len(parts) == 2:
-                city, state = parts
-                location = Location(
-                    city=city,
-                    state=state,
-                    country=Country.from_string(self.country),
-                )
-            elif len(parts) == 3:
-                city, state, country = parts
-                country = Country.from_string(country)
-                location = Location(city=city, state=state, country=country)
-        return location
-
-    @staticmethod
-    def _parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None:
-        """
-        Gets the job type from job page
-        :param soup_job_type:
-        :return: JobType
-        """
-        h3_tag = soup_job_type.find(
-            "h3",
-            class_="description__job-criteria-subheader",
-            string=lambda text: "Employment type" in text,
-        )
-        employment_type = None
-        if h3_tag:
-            employment_type_span = h3_tag.find_next_sibling(
-                "span",
-                class_="description__job-criteria-text description__job-criteria-text--criteria",
-            )
-            if employment_type_span:
-                employment_type = employment_type_span.get_text(strip=True)
-                employment_type = employment_type.lower()
-                employment_type = employment_type.replace("-", "")
-
-        return [get_enum_from_job_type(employment_type)] if employment_type else []
-
-    @staticmethod
-    def _parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
-        """
-        Gets the job level from job page
-        :param soup_job_level:
-        :return: str
-        """
-        h3_tag = soup_job_level.find(
-            "h3",
-            class_="description__job-criteria-subheader",
-            string=lambda text: "Seniority level" in text,
-        )
-        job_level = None
-        if h3_tag:
-            job_level_span = h3_tag.find_next_sibling(
-                "span",
-                class_="description__job-criteria-text description__job-criteria-text--criteria",
-            )
-            if job_level_span:
-                job_level = job_level_span.get_text(strip=True)
-
-        return job_level
-
-    @staticmethod
-    def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
-        """
-        Gets the company industry from job page
-        :param soup_industry:
-        :return: str
-        """
-        h3_tag = soup_industry.find(
-            "h3",
-            class_="description__job-criteria-subheader",
-            string=lambda text: "Industries" in text,
-        )
-        industry = None
-        if h3_tag:
-            industry_span = h3_tag.find_next_sibling(
-                "span",
-                class_="description__job-criteria-text description__job-criteria-text--criteria",
-            )
-            if industry_span:
-                industry = industry_span.get_text(strip=True)
-
-        return industry
-
-    def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
-        """
-        Gets the job url direct from job page
-        :param soup:
-        :return: str
-        """
-        job_url_direct = None
-        job_url_direct_content = soup.find("code", id="applyUrl")
-        if job_url_direct_content:
-            job_url_direct_match = self.job_url_direct_regex.search(
-                job_url_direct_content.decode_contents().strip()
-            )
-            if job_url_direct_match:
-                job_url_direct = unquote(job_url_direct_match.group())
-
-        return job_url_direct
-
-    @staticmethod
-    def job_type_code(job_type_enum: JobType) -> str:
-        return {
-            JobType.FULL_TIME: "F",
-            JobType.PART_TIME: "P",
-            JobType.INTERNSHIP: "I",
-            JobType.CONTRACT: "C",
-            JobType.TEMPORARY: "T",
-        }.get(job_type_enum, "")
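
A throwaway test sketch for the slimmed-down scraper (illustration only; it assumes ScraperInput keeps its upstream fields such as site_type and search_term, and that the constructor's proxies/ca_cert arguments default to None):

    from jobspy.scrapers import ScraperInput
    from jobspy.scrapers.site import Site
    from jobspy.scrapers.goozali import GoozaliScraper

    scraper = GoozaliScraper()  # assumed defaults: proxies=None, ca_cert=None
    response = scraper.scrape(ScraperInput(site_type=[Site.GOOZALI], search_term="backend"))
    # stays empty until the "model the response" TODO comments above are implemented
    print(len(response.jobs))
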
@@ -1,8 +1,62 @@
+view_ids = ["viwIOzPYaUGxlA0Jd"]
+
 headers = {
-    "authority": "www.linkedin.com",
-    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
-    "accept-language": "en-US,en;q=0.9",
-    "cache-control": "max-age=0",
-    "upgrade-insecure-requests": "1",
-    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+    "accept": "*/*",
+    "accept-language": "en-US,en;q=0.9,he-IL;q=0.8,he;q=0.7",
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+    "x-airtable-accept-msgpack": "true",
+    "x-airtable-application-id": "appwewqLk7iUY4azc",
+    "x-requested-with": "XMLHttpRequest"
 }
+
+session_id = "lWt/xRLIQas/blkys/2YBYl0priNI7gv85sXXtmkrW+TzbLHR8Vm6iY5RDialmLUYsQgLab8uWZyahWRw0HizxdOXhJxd5FB66H85GpUAX8zZbAZPZdUHvzxjaVa130w14QSXDa8OmsNlpKtiUtZ/DXMTOZ1wYDWC4tVJTKJ171wyKA7C9E="
+
+cookies = {
+    "__Host-airtable-session": "eyJzZXNzaW9uSWQiOiJzZXNxdFV4bVdKRVRoVGtRMiIsImNzcmZTZWNyZXQiOiIyT0JrVTJkU2I4bDA3NFZIRmd6eTdjTHUifQ==",
+    "__Host-airtable-session.sig": "heWRrVH73Aa-2ALrH4c_CbvQqTNbNRv9VjPZYv3aHJ4",
+    "brw": "brwtN7N3OgPFrtfb2",
+    "brwConsent": "opt-in",
+    "acq": "eyJhY3F1aXNpdGlvbiI6Ilt7XCJwbGF0Zm9ybVwiOlwiZGVza3RvcFwiLFwib3JpZ2luXCI6XCJsb2dpblwiLFwidG91Y2hUaW1lXCI6XCIyMDI0LTEyLTEyVDE3OjU1OjQyLjU3OVpcIn1dIn0=",
+    "acq.sig": "5xrqXjip4IJZxIeSPCkajWt_wlBmGw-k7HJCj8wicxU",
+    "AWSALBTGCORS": "YoIaU+wibkMfutpYUIlGnvYmnUa0VjM2ukwIhESaxfQUNL+PkCcRm5MIXVI5Q+dNJn7rAfdvTlrSF8XXU7wIWQqg8DQn2+OmvFeR5uzreWH5QaRIodTZ5gVQpXK1A62oDSR18fgyIOBRza2wIiet/67JgimPxGpuecdbz2oUwr7UqifGVz0="
+}
+
+request_id = "req4q4tKw3woEEWxw&"
+share_id = "shrQBuWjXd0YgPqV6"
+application_id = "appwewqLk7iUY4azc"
+signature = "be8bd40c133f051f929ebab311c416013f5af0d5acae4264575b88ccf051ee59"
+
+
+def get_access_policy(view_id: str) -> dict[str, str]:
+    return {
+        "allowedActions": [
+            {
+                "modelClassName": "view",
+                "modelIdSelector": view_id,
+                "action": "readSharedViewData"
+            },
+            {
+                "modelClassName": "view",
+                "modelIdSelector": view_id,
+                "action": "getMetadataForPrinting"
+            },
+            {
+                "modelClassName": "view",
+                "modelIdSelector": view_id,
+                "action": "readSignedAttachmentUrls"
+            },
+            {
+                "modelClassName": "row",
+                "modelIdSelector": f"rows *[displayedInView={view_id}]",
+                "action": "createDocumentPreviewSession"
+            }
+        ],
+        "shareId": share_id,
+        "applicationId": application_id,
+        "generationNumber": 0,
+        # "expires": "2025-01-02T00:00:00.000Z", # todo:: check how to set it
+        "signature": signature
+    }
+
+
+stringifiedObjectParams = {"shouldUseNestedResponseFormat": "true"}
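
The captured URL in the test module below shows stringifiedObjectParams and accessPolicy travelling as URL-encoded JSON strings. A sketch of building that query string from these constants (an assumption based on that example URL, not on Airtable documentation); note that _get_params above sends the key "request_id" while the captured URL uses "requestId", and the request_id constant carries a trailing "&" that looks copied from a URL:

    # sketch: assumes it lives next to the constants above (get_access_policy, request_id, view_ids)
    import json
    from urllib.parse import urlencode, quote

    def build_read_shared_view_data_query(view_id: str) -> str:
        return urlencode({
            "stringifiedObjectParams": json.dumps({"shouldUseNestedResponseFormat": True}, separators=(",", ":")),
            "requestId": request_id.rstrip("&"),
            "accessPolicy": json.dumps(get_access_policy(view_id), separators=(",", ":")),
        }, quote_via=quote)

    # e.g. "https://airtable.com/v0.3/view/" + view_ids[0] + "/readSharedViewData?" + build_read_shared_view_data_query(view_ids[0])
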
@@ -0,0 +1,10 @@
+from enum import Enum
+
+
+class Site(Enum):
+    LINKEDIN = "linkedin"
+    INDEED = "indeed"
+    ZIP_RECRUITER = "zip_recruiter"
+    GLASSDOOR = "glassdoor"
+    GOOGLE = "google"
+    GOOZALI = "goozali"
@@ -7,6 +7,9 @@ from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoic
 from jobspy.scrapers.goozali.model.GoozaliRow import GoozaliRow
 from jobspy.scrapers.goozali.model.GozaaliResponseData import GoozaliResponseData

+# URL Example
+# https://airtable.com/v0.3/view/viwagEIbkfz2iMsLU/readSharedViewData?stringifiedObjectParams=%7B%22shouldUseNestedResponseFormat%22%3Atrue%7D&requestId=reqXyRSHWlXyiRgY9&accessPolicy=%7B%22allowedActions%22%3A%5B%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSharedViewData%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22getMetadataForPrinting%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSignedAttachmentUrls%22%7D%2C%7B%22modelClassName%22%3A%22row%22%2C%22modelIdSelector%22%3A%22rows%20*%5BdisplayedInView%3DviwagEIbkfz2iMsLU%5D%22%2C%22action%22%3A%22createDocumentPreviewSession%22%7D%5D%2C%22shareId%22%3A%22shr97tl6luEk4Ca9R%22%2C%22applicationId%22%3A%22app5sYJyDgcRbJWYU%22%2C%22generationNumber%22%3A0%2C%22expires%22%3A%222025-01-02T00%3A00%3A00.000Z%22%2C%22signature%22%3A%223aa292ee44d15aa75d9506200329e413653471f89e000fa370ef9fa38393070a%22%7D
+

 def test_goozali():
     result = scrape_jobs(