small changes to help testing

pull/231/head
Yariv Menachem 2024-12-15 18:40:32 +02:00
parent 00f13bdf1a
commit f02e3f7a73
7 changed files with 138 additions and 246 deletions

View File

@ -1,10 +1,15 @@
from __future__ import annotations
from datetime import datetime
from enum import Enum
import pandas as pd
from typing import Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from jobspy.scrapers.site import Site
from .scrapers.goozali import GoozaliScraper
from .jobs import JobPost, JobType, Location
from .scrapers.utils import set_logger_level, extract_salary, create_logger
from .scrapers.indeed import IndeedScraper
@ -12,7 +17,7 @@ from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.glassdoor import GlassdoorScraper
from .scrapers.google import GoogleJobsScraper
from .scrapers.linkedin import LinkedInScraper
from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
from .scrapers import SalarySource, ScraperInput, JobResponse, Country
from .scrapers.exceptions import (
LinkedInException,
IndeedException,
@ -21,6 +26,7 @@ from .scrapers.exceptions import (
GoogleJobsException,
)
def scrape_jobs(
site_name: str | list[str] | Site | list[Site] | None = None,
search_term: str | None = None,
@ -55,6 +61,7 @@ def scrape_jobs(
Site.ZIP_RECRUITER: ZipRecruiterScraper,
Site.GLASSDOOR: GlassdoorScraper,
Site.GOOGLE: GoogleJobsScraper,
Site.GOOZALI: GoozaliScraper,
}
set_logger_level(verbose)
@ -102,7 +109,7 @@ def scrape_jobs(
offset=offset,
hours_old=hours_old,
)
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxies=proxies, ca_cert=ca_cert)
@ -113,12 +120,14 @@ def scrape_jobs(
return site.value, scraped_data
site_to_jobs_dict = {}
merged_jobs:list[JobPost] = []
merged_jobs: list[JobPost] = []
def worker(site):
site_val, scraped_info = scrape_site(site)
# Add the scraped jobs to the merged list
merged_jobs.extend(scraped_info.jobs) # Assuming scraped_info has 'jobs' as a list
# Add the scraped jobs to the merged list
# Assuming scraped_info has 'jobs' as a list
merged_jobs.extend(scraped_info.jobs)
return site_val, scraped_info
with ThreadPoolExecutor() as executor:
@ -129,8 +138,9 @@ def scrape_jobs(
for future in as_completed(future_to_site):
site_value, scraped_data = future.result()
site_to_jobs_dict[site_value] = scraped_data
return merged_jobs
def convert_to_annual(job_data: dict):
if job_data["interval"] == "hourly":
job_data["min_amount"] *= 2080
@ -156,7 +166,8 @@ def scrape_jobs(
job_data["site"] = site
job_data["company"] = job_data["company_name"]
job_data["job_type"] = (
", ".join(job_type.value[0] for job_type in job_data["job_type"])
", ".join(job_type.value[0]
for job_type in job_data["job_type"])
if job_data["job_type"]
else None
)
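With Site.GOOZALI registered in SCRAPER_MAPPING and scrape_jobs now returning the merged list of JobPost objects, a minimal call might look like the sketch below (the search term and result handling are illustrative, not part of this commit):

from jobspy import scrape_jobs

# scrape only the new Goozali source; hours_old limits results to recent postings
jobs = scrape_jobs(
    site_name="goozali",
    search_term="python developer",  # hypothetical query
    hours_old=72,
)
print(f"scraped {len(jobs)} jobs")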

View File

@ -1,19 +1,9 @@
import asyncio
from enum import Enum
from db.job_repository import JobRepository
from jobspy import scrape_jobs
from jobspy import Site, scrape_jobs
from jobspy.db.job_repository import JobRepository
from jobspy.telegram_bot import TelegramBot
class Site(Enum):
LINKEDIN = "linkedin"
GOOZALI = "goozali"
INDEED = "indeed"
ZIP_RECRUITER = "zip_recruiter"
GLASSDOOR = "glassdoor"
GOOGLE = "google"
async def main():
telegramBot = TelegramBot()
jobRepository = JobRepository()

View File

@ -2,7 +2,8 @@ from __future__ import annotations
from abc import ABC, abstractmethod
from jobspy.main import Site
from jobspy.scrapers.site import Site
from ..jobs import (
Enum,

View File

@ -7,43 +7,19 @@ This module contains routines to scrape Goozali.
from __future__ import annotations
import math
import time
import random
import regex as re
from typing import Optional
from datetime import datetime
from jobspy.scrapers import Scraper, ScraperInput
from jobspy.scrapers.site import Site
from bs4.element import Tag
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urlunparse, unquote
from requests.exceptions import RetryError, RequestException
from urllib3.exceptions import MaxRetryError
from .constants import headers
from .. import Scraper, ScraperInput, Site
from ..exceptions import GoozaliException
from ..utils import create_session, remove_attributes, create_logger
from ..utils import create_session, create_logger
from .constants import get_access_policy, headers, cookies, stringifiedObjectParams, request_id, view_ids
from ...jobs import (
JobPost,
Location,
JobResponse,
JobType,
Country,
Compensation,
DescriptionFormat,
)
from ..utils import (
extract_emails_from_text,
get_enum_from_job_type,
currency_parser,
markdown_converter,
)
logger = create_logger("Goozali")
class GoozaliScraper(Scraper):
base_url = "https://www.Goozali.com"
delay = 3
band_delay = 4
jobs_per_page = 25
@ -54,19 +30,26 @@ class GoozaliScraper(Scraper):
"""
Initializes GoozaliScraper with the Goozali job search url
"""
super().__init__(Site.GOOZALI, proxies=proxies, ca_cert=ca_cert)
super().__init__(site=Site.GOOZALI, proxies=proxies, ca_cert=ca_cert)
self.session = create_session(
proxies=self.proxies,
ca_cert=ca_cert,
is_tls=False,
has_retry=True,
delay=5,
clear_cookies=True,
clear_cookies=False,
)
self.session.headers.update(headers)
self.scraper_input = None
self.country = "worldwide"
self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+')
self.base_url = "https://airtable.com/v0.3/view/{view_id}/readSharedViewData"
def _get_params(self, view_id: str) -> dict[str, str]:
access_policy = get_access_policy(view_id)
params = {
"stringifiedObjectParams": stringifiedObjectParams,
"request_id": request_id,
"accessPolicy": access_policy
}
return {k: v for k, v in params.items() if v is not None}
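# Illustrative only: for the shared view "viwIOzPYaUGxlA0Jd" from constants.view_ids,
# the request built by scrape() below becomes roughly
#   GET https://airtable.com/v0.3/view/viwIOzPYaUGxlA0Jd/readSharedViewData
# with stringifiedObjectParams, request_id and the per-view accessPolicy sent as
# query parameters (see the fully encoded URL example in the tests).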
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
@ -77,189 +60,29 @@ class GoozaliScraper(Scraper):
self.scraper_input = scraper_input
job_list: list[JobPost] = []
seen_ids = set()
# create url
# create session -> run the api
# model the response with models
# create map columnId to Column object
# filter result by Field like the web
# filter by date
# map to JobResponse Object
return JobResponse(jobs=job_list)
def _get_job_details(self, job_id: str) -> dict:
"""
Retrieves job description and other job details by going to the job page url
:param job_page_url:
:return: dict
"""
try:
response = self.session.get(
f"{self.base_url}/jobs/view/{job_id}", timeout=5
)
response.raise_for_status()
except:
return {}
if "Goozali.com/signup" in response.url:
return {}
soup = BeautifulSoup(response.text, "html.parser")
div_content = soup.find(
"div", class_=lambda x: x and "show-more-less-html__markup" in x
)
description = None
if div_content is not None:
div_content = remove_attributes(div_content)
description = div_content.prettify(formatter="html")
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description)
h3_tag = soup.find(
"h3", text=lambda text: text and "Job function" in text.strip()
)
job_function = None
if h3_tag:
job_function_span = h3_tag.find_next(
"span", class_="description__job-criteria-text"
)
if job_function_span:
job_function = job_function_span.text.strip()
company_logo = (
logo_image.get("data-delayed-url")
if (logo_image := soup.find("img", {"class": "artdeco-entity-image"}))
else None
)
return {
"description": description,
"job_level": self._parse_job_level(soup),
"company_industry": self._parse_company_industry(soup),
"job_type": self._parse_job_type(soup),
"job_url_direct": self._parse_job_url_direct(soup),
"company_logo": company_logo,
"job_function": job_function,
}
def _get_location(self, metadata_card: Optional[Tag]) -> Location:
"""
Extracts the location data from the job metadata card.
:param metadata_card
:return: location
"""
location = Location(country=Country.from_string(self.country))
if metadata_card is not None:
location_tag = metadata_card.find(
"span", class_="job-search-card__location"
)
location_string = location_tag.text.strip() if location_tag else "N/A"
parts = location_string.split(", ")
if len(parts) == 2:
city, state = parts
location = Location(
city=city,
state=state,
country=Country.from_string(self.country),
for view_id in view_ids:
# create url
url = self.base_url.format(view_id=view_id)
params = self._get_params(view_id)
# create session -> run the api
try:
response = self.session.get(
url=url,
params=params,
timeout=10,
headers=headers,
cookies=cookies
)
elif len(parts) == 3:
city, state, country = parts
country = Country.from_string(country)
location = Location(city=city, state=state, country=country)
return location
@staticmethod
def _parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None:
"""
Gets the job type from job page
:param soup_job_type:
:return: JobType
"""
h3_tag = soup_job_type.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Employment type" in text,
)
employment_type = None
if h3_tag:
employment_type_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if employment_type_span:
employment_type = employment_type_span.get_text(strip=True)
employment_type = employment_type.lower()
employment_type = employment_type.replace("-", "")
return [get_enum_from_job_type(employment_type)] if employment_type else []
@staticmethod
def _parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
"""
Gets the job level from job page
:param soup_job_level:
:return: str
"""
h3_tag = soup_job_level.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Seniority level" in text,
)
job_level = None
if h3_tag:
job_level_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if job_level_span:
job_level = job_level_span.get_text(strip=True)
return job_level
@staticmethod
def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
"""
Gets the company industry from job page
:param soup_industry:
:return: str
"""
h3_tag = soup_industry.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Industries" in text,
)
industry = None
if h3_tag:
industry_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if industry_span:
industry = industry_span.get_text(strip=True)
return industry
def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
"""
Gets the job url direct from job page
:param soup:
:return: str
"""
job_url_direct = None
job_url_direct_content = soup.find("code", id="applyUrl")
if job_url_direct_content:
job_url_direct_match = self.job_url_direct_regex.search(
job_url_direct_content.decode_contents().strip()
)
if job_url_direct_match:
job_url_direct = unquote(job_url_direct_match.group())
return job_url_direct
@staticmethod
def job_type_code(job_type_enum: JobType) -> str:
return {
JobType.FULL_TIME: "F",
JobType.PART_TIME: "P",
JobType.INTERNSHIP: "I",
JobType.CONTRACT: "C",
JobType.TEMPORARY: "T",
}.get(job_type_enum, "")
logger.info(f"response: {str(response)}")
if (response.status_code != 200):
logger.error(f"Status code: {
response.status_code}, Error: {str(response.text)}")
return JobResponse(jobs=job_list)
except Exception as e:
logger.error(f"Exception: {str(e)}")
# model the response with models
# create map columnId to Column object
# filter result by Field like the web
# filter by date
# map to JobResponse Object
return JobResponse(jobs=job_list)
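The commented steps above are still stubs. A rough sketch of the intended flow, assuming the GoozaliResponseData / GoozaliRow / GoozaliColumn models referenced by the tests expose the fields used here (their exact attributes, and the two helper methods, are assumptions for illustration):

# parse the shared-view payload into the response model (payload shape assumed)
data = GoozaliResponseData(**response.json()["data"])
# map columnId -> column so row cells can be read by field name
column_by_id = {column.id: column for column in data.columns}
# keep only rows recent enough for scraper_input.hours_old (hypothetical helper)
recent_rows = [row for row in data.rows if self._is_recent(row, scraper_input.hours_old)]
# convert each remaining row into a JobPost (hypothetical helper) and collect
job_list.extend(self._map_row_to_job_post(row, column_by_id) for row in recent_rows)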

View File

@ -1,8 +1,62 @@
view_ids = ["viwIOzPYaUGxlA0Jd"]
headers = {
"authority": "www.linkedin.com",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-language": "en-US,en;q=0.9",
"cache-control": "max-age=0",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"accept": "*/*",
"accept-language": "en-US,en;q=0.9,he-IL;q=0.8,he;q=0.7",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
"x-airtable-accept-msgpack": "true",
"x-airtable-application-id": "appwewqLk7iUY4azc",
"x-requested-with": "XMLHttpRequest"
}
session_id = "lWt/xRLIQas/blkys/2YBYl0priNI7gv85sXXtmkrW+TzbLHR8Vm6iY5RDialmLUYsQgLab8uWZyahWRw0HizxdOXhJxd5FB66H85GpUAX8zZbAZPZdUHvzxjaVa130w14QSXDa8OmsNlpKtiUtZ/DXMTOZ1wYDWC4tVJTKJ171wyKA7C9E="
cookies = {
"__Host-airtable-session": "eyJzZXNzaW9uSWQiOiJzZXNxdFV4bVdKRVRoVGtRMiIsImNzcmZTZWNyZXQiOiIyT0JrVTJkU2I4bDA3NFZIRmd6eTdjTHUifQ==",
"__Host-airtable-session.sig": "heWRrVH73Aa-2ALrH4c_CbvQqTNbNRv9VjPZYv3aHJ4",
"brw": "brwtN7N3OgPFrtfb2",
"brwConsent": "opt-in",
"acq": "eyJhY3F1aXNpdGlvbiI6Ilt7XCJwbGF0Zm9ybVwiOlwiZGVza3RvcFwiLFwib3JpZ2luXCI6XCJsb2dpblwiLFwidG91Y2hUaW1lXCI6XCIyMDI0LTEyLTEyVDE3OjU1OjQyLjU3OVpcIn1dIn0=",
"acq.sig": "5xrqXjip4IJZxIeSPCkajWt_wlBmGw-k7HJCj8wicxU",
"AWSALBTGCORS": "YoIaU+wibkMfutpYUIlGnvYmnUa0VjM2ukwIhESaxfQUNL+PkCcRm5MIXVI5Q+dNJn7rAfdvTlrSF8XXU7wIWQqg8DQn2+OmvFeR5uzreWH5QaRIodTZ5gVQpXK1A62oDSR18fgyIOBRza2wIiet/67JgimPxGpuecdbz2oUwr7UqifGVz0="
}
request_id = "req4q4tKw3woEEWxw&"
share_id = "shrQBuWjXd0YgPqV6"
application_id = "appwewqLk7iUY4azc"
signature = "be8bd40c133f051f929ebab311c416013f5af0d5acae4264575b88ccf051ee59"
def get_access_policy(view_id: str) -> dict[str, str]:
return {
"allowedActions": [
{
"modelClassName": "view",
"modelIdSelector": view_id,
"action": "readSharedViewData"
},
{
"modelClassName": "view",
"modelIdSelector": view_id,
"action": "getMetadataForPrinting"
},
{
"modelClassName": "view",
"modelIdSelector": view_id,
"action": "readSignedAttachmentUrls"
},
{
"modelClassName": "row",
"modelIdSelector": f"rows *[displayedInView={view_id}]",
"action": "createDocumentPreviewSession"
}
],
"shareId": share_id,
"applicationId": application_id,
"generationNumber": 0,
# "expires": "2025-01-02T00:00:00.000Z", # todo:: check how to set it
"signature": signature
}
stringifiedObjectParams = {"shouldUseNestedResponseFormat": "true"}
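Airtable expects stringifiedObjectParams and accessPolicy as JSON strings in the query (see the encoded URL example in the tests). A minimal sketch of producing that encoding from these constants, purely for reference:

import json
from urllib.parse import urlencode

view_id = view_ids[0]
query = urlencode({
    "stringifiedObjectParams": json.dumps({"shouldUseNestedResponseFormat": True}),
    "requestId": request_id,
    "accessPolicy": json.dumps(get_access_policy(view_id)),
})
url = f"https://airtable.com/v0.3/view/{view_id}/readSharedViewData?{query}"

Note that the encoded example in the tests uses the query key requestId, while the constant above is named request_id.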

View File

@ -0,0 +1,10 @@
from enum import Enum
class Site(Enum):
LINKEDIN = "linkedin"
INDEED = "indeed"
ZIP_RECRUITER = "zip_recruiter"
GLASSDOOR = "glassdoor"
GOOGLE = "google"
GOOZALI = "goozali"
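With Site defined in its own module and re-exported from the package, callers can import it directly, as main.py above now does; an illustrative call:

from jobspy import Site, scrape_jobs

jobs = scrape_jobs(site_name=[Site.GOOZALI, Site.LINKEDIN])  # illustrative site list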

View File

@ -7,6 +7,9 @@ from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoic
from jobspy.scrapers.goozali.model.GoozaliRow import GoozaliRow
from jobspy.scrapers.goozali.model.GozaaliResponseData import GoozaliResponseData
# URL Example
# https://airtable.com/v0.3/view/viwagEIbkfz2iMsLU/readSharedViewData?stringifiedObjectParams=%7B%22shouldUseNestedResponseFormat%22%3Atrue%7D&requestId=reqXyRSHWlXyiRgY9&accessPolicy=%7B%22allowedActions%22%3A%5B%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSharedViewData%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22getMetadataForPrinting%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSignedAttachmentUrls%22%7D%2C%7B%22modelClassName%22%3A%22row%22%2C%22modelIdSelector%22%3A%22rows%20*%5BdisplayedInView%3DviwagEIbkfz2iMsLU%5D%22%2C%22action%22%3A%22createDocumentPreviewSession%22%7D%5D%2C%22shareId%22%3A%22shr97tl6luEk4Ca9R%22%2C%22applicationId%22%3A%22app5sYJyDgcRbJWYU%22%2C%22generationNumber%22%3A0%2C%22expires%22%3A%222025-01-02T00%3A00%3A00.000Z%22%2C%22signature%22%3A%223aa292ee44d15aa75d9506200329e413653471f89e000fa370ef9fa38393070a%22%7D
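# Decoded for readability, the query parameters in the example above are:
#   stringifiedObjectParams = {"shouldUseNestedResponseFormat": true}
#   requestId = reqXyRSHWlXyiRgY9
#   accessPolicy = {
#     "allowedActions": [readSharedViewData, getMetadataForPrinting,
#                        readSignedAttachmentUrls, createDocumentPreviewSession
#                        for view viwagEIbkfz2iMsLU],
#     "shareId": "shr97tl6luEk4Ca9R", "applicationId": "app5sYJyDgcRbJWYU",
#     "generationNumber": 0, "expires": "2025-01-02T00:00:00.000Z",
#     "signature": "3aa292ee44d15aa75d9506200329e413653471f89e000fa370ef9fa38393070a"
#   }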
def test_goozali():
result = scrape_jobs(