small changes to help testing

pull/231/head
Yariv Menachem 2024-12-15 18:40:32 +02:00
parent 00f13bdf1a
commit f02e3f7a73
7 changed files with 138 additions and 246 deletions

View File

@@ -1,10 +1,15 @@
 from __future__ import annotations
 from datetime import datetime
+from enum import Enum
 import pandas as pd
 from typing import Tuple
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from jobspy.scrapers.site import Site
+from .scrapers.goozali import GoozaliScraper
 from .jobs import JobPost, JobType, Location
 from .scrapers.utils import set_logger_level, extract_salary, create_logger
 from .scrapers.indeed import IndeedScraper
@@ -12,7 +17,7 @@ from .scrapers.ziprecruiter import ZipRecruiterScraper
 from .scrapers.glassdoor import GlassdoorScraper
 from .scrapers.google import GoogleJobsScraper
 from .scrapers.linkedin import LinkedInScraper
-from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
+from .scrapers import SalarySource, ScraperInput, JobResponse, Country
 from .scrapers.exceptions import (
     LinkedInException,
     IndeedException,
@@ -21,6 +26,7 @@ from .scrapers.exceptions import (
     GoogleJobsException,
 )
 def scrape_jobs(
     site_name: str | list[str] | Site | list[Site] | None = None,
     search_term: str | None = None,
@@ -55,6 +61,7 @@ def scrape_jobs(
         Site.ZIP_RECRUITER: ZipRecruiterScraper,
         Site.GLASSDOOR: GlassdoorScraper,
         Site.GOOGLE: GoogleJobsScraper,
+        Site.GOOZALI: GoozaliScraper,
     }
     set_logger_level(verbose)
@@ -114,10 +121,12 @@ def scrape_jobs(
     site_to_jobs_dict = {}
     merged_jobs: list[JobPost] = []
     def worker(site):
         site_val, scraped_info = scrape_site(site)
         # Add the scraped jobs to the merged list
-        merged_jobs.extend(scraped_info.jobs)  # Assuming scraped_info has 'jobs' as a list
+        # Assuming scraped_info has 'jobs' as a list
+        merged_jobs.extend(scraped_info.jobs)
         return site_val, scraped_info
@@ -131,6 +140,7 @@ def scrape_jobs(
             site_to_jobs_dict[site_value] = scraped_data
     return merged_jobs
     def convert_to_annual(job_data: dict):
         if job_data["interval"] == "hourly":
             job_data["min_amount"] *= 2080
@@ -156,7 +166,8 @@ def scrape_jobs(
             job_data["site"] = site
             job_data["company"] = job_data["company_name"]
             job_data["job_type"] = (
-                ", ".join(job_type.value[0] for job_type in job_data["job_type"])
+                ", ".join(job_type.value[0]
+                          for job_type in job_data["job_type"])
                 if job_data["job_type"]
                 else None
             )
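A minimal usage sketch of the updated entry point with the new Goozali site enabled (not part of this commit; the search term and result count are illustrative, and the rest of the signature is assumed to match the existing scrape_jobs parameters):

    # Illustrative only: exercise Site.GOOZALI through the public entry point.
    from jobspy import Site, scrape_jobs

    jobs = scrape_jobs(
        site_name=[Site.GOOZALI],
        search_term="software engineer",  # illustrative value
        results_wanted=10,                # assumed existing parameter
    )
    # scrape_jobs now returns the merged JobPost list
    print(f"scraped {len(jobs)} job posts")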

View File

@@ -1,19 +1,9 @@
 import asyncio
-from enum import Enum
-from db.job_repository import JobRepository
-from jobspy import scrape_jobs
+from jobspy import Site, scrape_jobs
+from jobspy.db.job_repository import JobRepository
 from jobspy.telegram_bot import TelegramBot
-class Site(Enum):
-    LINKEDIN = "linkedin"
-    GOOZALI = "goozali"
-    INDEED = "indeed"
-    ZIP_RECRUITER = "zip_recruiter"
-    GLASSDOOR = "glassdoor"
-    GOOGLE = "google"
 async def main():
     telegramBot = TelegramBot()
     jobRepository = JobRepository()

View File

@@ -2,7 +1,8 @@ from __future__ import annotations
 from abc import ABC, abstractmethod
-from jobspy.main import Site
+from jobspy.scrapers.site import Site
 from ..jobs import (
     Enum,

View File

@@ -7,43 +7,19 @@ This module contains routines to scrape Goozali.
 from __future__ import annotations
-import math
-import time
-import random
-import regex as re
-from typing import Optional
-from datetime import datetime
-from bs4.element import Tag
-from bs4 import BeautifulSoup
-from urllib.parse import urlparse, urlunparse, unquote
-from requests.exceptions import RetryError, RequestException
-from urllib3.exceptions import MaxRetryError
-from .constants import headers
-from .. import Scraper, ScraperInput, Site
-from ..exceptions import GoozaliException
-from ..utils import create_session, remove_attributes, create_logger
+from jobspy.scrapers import Scraper, ScraperInput
+from jobspy.scrapers.site import Site
+from ..utils import create_session, create_logger
+from .constants import get_access_policy, headers, cookies, stringifiedObjectParams, request_id, view_ids
 from ...jobs import (
     JobPost,
-    Location,
     JobResponse,
-    JobType,
-    Country,
-    Compensation,
-    DescriptionFormat,
 )
-from ..utils import (
-    extract_emails_from_text,
-    get_enum_from_job_type,
-    currency_parser,
-    markdown_converter,
-)
 logger = create_logger("Goozali")
 class GoozaliScraper(Scraper):
-    base_url = "https://www.Goozali.com"
     delay = 3
     band_delay = 4
     jobs_per_page = 25
@@ -54,19 +30,26 @@ class GoozaliScraper(Scraper):
         """
         Initializes GoozaliScraper with the Goozalijob search url
         """
-        super().__init__(Site.GOOZALI, proxies=proxies, ca_cert=ca_cert)
+        super().__init__(site=Site.GOOZALI, proxies=proxies, ca_cert=ca_cert)
         self.session = create_session(
             proxies=self.proxies,
             ca_cert=ca_cert,
             is_tls=False,
             has_retry=True,
             delay=5,
-            clear_cookies=True,
+            clear_cookies=False,
         )
-        self.session.headers.update(headers)
-        self.scraper_input = None
-        self.country = "worldwide"
-        self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+')
+        self.base_url = "https://airtable.com/v0.3/view/{view_id}/readSharedViewData"
+    def _get_params(self, view_id: str) -> dict[str, str]:
+        access_policy = get_access_policy(view_id)
+        params = {
+            "stringifiedObjectParams": stringifiedObjectParams,
+            "request_id": request_id,
+            "accessPolicy": access_policy
+        }
+        return {k: v for k, v in params.items() if v is not None}
     def scrape(self, scraper_input: ScraperInput) -> JobResponse:
         """
@@ -77,189 +60,29 @@ class GoozaliScraper(Scraper):
         self.scraper_input = scraper_input
         job_list: list[JobPost] = []
         seen_ids = set()
+        for view_id in view_ids:
             # create url
+            url = self.base_url.format(view_id=view_id)
+            params = self._get_params(view_id)
             # create session -> run the api
+            try:
+                response = self.session.get(
+                    url=url,
+                    params=params,
+                    timeout=10,
+                    headers=headers,
+                    cookies=cookies
+                )
+                logger.info(f"response: {str(response)}")
+                if (response.status_code != 200):
+                    logger.error(f"Status code: {
+                        response.status_code}, Error: {str(response.text)}")
+                    return JobResponse(jobs=job_list)
+            except Exception as e:
+                logger.error(f"Exception: {str(e)}")
             # model the response with models
             # create map columnId to Column object
             # filter result by Field like the web
             # filter by date
             # map to JobResponse Object
         return JobResponse(jobs=job_list)
-    def _get_job_details(self, job_id: str) -> dict:
-        """
-        Retrieves job description and other job details by going to the job page url
-        :param job_page_url:
-        :return: dict
-        """
-        try:
-            response = self.session.get(
-                f"{self.base_url}/jobs/view/{job_id}", timeout=5
-            )
-            response.raise_for_status()
-        except:
-            return {}
-        if "Goozali.com/signup" in response.url:
-            return {}
-        soup = BeautifulSoup(response.text, "html.parser")
-        div_content = soup.find(
-            "div", class_=lambda x: x and "show-more-less-html__markup" in x
-        )
-        description = None
-        if div_content is not None:
-            div_content = remove_attributes(div_content)
-            description = div_content.prettify(formatter="html")
-            if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
-                description = markdown_converter(description)
-        h3_tag = soup.find(
-            "h3", text=lambda text: text and "Job function" in text.strip()
-        )
-        job_function = None
-        if h3_tag:
-            job_function_span = h3_tag.find_next(
-                "span", class_="description__job-criteria-text"
-            )
-            if job_function_span:
-                job_function = job_function_span.text.strip()
-        company_logo = (
-            logo_image.get("data-delayed-url")
-            if (logo_image := soup.find("img", {"class": "artdeco-entity-image"}))
-            else None
-        )
-        return {
-            "description": description,
-            "job_level": self._parse_job_level(soup),
-            "company_industry": self._parse_company_industry(soup),
-            "job_type": self._parse_job_type(soup),
-            "job_url_direct": self._parse_job_url_direct(soup),
-            "company_logo": company_logo,
-            "job_function": job_function,
-        }
-    def _get_location(self, metadata_card: Optional[Tag]) -> Location:
-        """
-        Extracts the location data from the job metadata card.
-        :param metadata_card
-        :return: location
-        """
-        location = Location(country=Country.from_string(self.country))
-        if metadata_card is not None:
-            location_tag = metadata_card.find(
-                "span", class_="job-search-card__location"
-            )
-            location_string = location_tag.text.strip() if location_tag else "N/A"
-            parts = location_string.split(", ")
-            if len(parts) == 2:
-                city, state = parts
-                location = Location(
-                    city=city,
-                    state=state,
-                    country=Country.from_string(self.country),
-                )
-            elif len(parts) == 3:
-                city, state, country = parts
-                country = Country.from_string(country)
-                location = Location(city=city, state=state, country=country)
-        return location
-    @staticmethod
-    def _parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None:
-        """
-        Gets the job type from job page
-        :param soup_job_type:
-        :return: JobType
-        """
-        h3_tag = soup_job_type.find(
-            "h3",
-            class_="description__job-criteria-subheader",
-            string=lambda text: "Employment type" in text,
-        )
-        employment_type = None
-        if h3_tag:
-            employment_type_span = h3_tag.find_next_sibling(
-                "span",
-                class_="description__job-criteria-text description__job-criteria-text--criteria",
-            )
-            if employment_type_span:
-                employment_type = employment_type_span.get_text(strip=True)
-                employment_type = employment_type.lower()
-                employment_type = employment_type.replace("-", "")
-        return [get_enum_from_job_type(employment_type)] if employment_type else []
-    @staticmethod
-    def _parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
-        """
-        Gets the job level from job page
-        :param soup_job_level:
-        :return: str
-        """
-        h3_tag = soup_job_level.find(
-            "h3",
-            class_="description__job-criteria-subheader",
-            string=lambda text: "Seniority level" in text,
-        )
-        job_level = None
-        if h3_tag:
-            job_level_span = h3_tag.find_next_sibling(
-                "span",
-                class_="description__job-criteria-text description__job-criteria-text--criteria",
-            )
-            if job_level_span:
-                job_level = job_level_span.get_text(strip=True)
-        return job_level
-    @staticmethod
-    def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
-        """
-        Gets the company industry from job page
-        :param soup_industry:
-        :return: str
-        """
-        h3_tag = soup_industry.find(
-            "h3",
-            class_="description__job-criteria-subheader",
-            string=lambda text: "Industries" in text,
-        )
-        industry = None
-        if h3_tag:
-            industry_span = h3_tag.find_next_sibling(
-                "span",
-                class_="description__job-criteria-text description__job-criteria-text--criteria",
-            )
-            if industry_span:
-                industry = industry_span.get_text(strip=True)
-        return industry
-    def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
-        """
-        Gets the job url direct from job page
-        :param soup:
-        :return: str
-        """
-        job_url_direct = None
-        job_url_direct_content = soup.find("code", id="applyUrl")
-        if job_url_direct_content:
-            job_url_direct_match = self.job_url_direct_regex.search(
-                job_url_direct_content.decode_contents().strip()
-            )
-            if job_url_direct_match:
-                job_url_direct = unquote(job_url_direct_match.group())
-        return job_url_direct
-    @staticmethod
-    def job_type_code(job_type_enum: JobType) -> str:
-        return {
-            JobType.FULL_TIME: "F",
-            JobType.PART_TIME: "P",
-            JobType.INTERNSHIP: "I",
-            JobType.CONTRACT: "C",
-            JobType.TEMPORARY: "T",
-        }.get(job_type_enum, "")
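For a quick check outside scrape_jobs, the reworked scraper can also be driven directly. A rough sketch under assumptions: GoozaliScraper's constructor arguments (proxies, ca_cert) are optional, and ScraperInput accepts site_type and search_term as it does for the other scrapers:

    from jobspy.scrapers import ScraperInput
    from jobspy.scrapers.goozali import GoozaliScraper
    from jobspy.scrapers.site import Site

    scraper = GoozaliScraper()  # proxies / ca_cert assumed optional
    scraper_input = ScraperInput(site_type=[Site.GOOZALI], search_term="backend")
    response = scraper.scrape(scraper_input)
    print(len(response.jobs))  # stays empty until the response-modelling TODOs are filled in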

View File

@@ -1,8 +1,62 @@
+view_ids = ["viwIOzPYaUGxlA0Jd"]
 headers = {
-    "authority": "www.linkedin.com",
-    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
-    "accept-language": "en-US,en;q=0.9",
-    "cache-control": "max-age=0",
-    "upgrade-insecure-requests": "1",
-    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+    "accept": "*/*",
+    "accept-language": "en-US,en;q=0.9,he-IL;q=0.8,he;q=0.7",
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+    "x-airtable-accept-msgpack": "true",
+    "x-airtable-application-id": "appwewqLk7iUY4azc",
+    "x-requested-with": "XMLHttpRequest"
 }
+session_id = "lWt/xRLIQas/blkys/2YBYl0priNI7gv85sXXtmkrW+TzbLHR8Vm6iY5RDialmLUYsQgLab8uWZyahWRw0HizxdOXhJxd5FB66H85GpUAX8zZbAZPZdUHvzxjaVa130w14QSXDa8OmsNlpKtiUtZ/DXMTOZ1wYDWC4tVJTKJ171wyKA7C9E="
+cookies = {
+    "__Host-airtable-session": "eyJzZXNzaW9uSWQiOiJzZXNxdFV4bVdKRVRoVGtRMiIsImNzcmZTZWNyZXQiOiIyT0JrVTJkU2I4bDA3NFZIRmd6eTdjTHUifQ==",
+    "__Host-airtable-session.sig": "heWRrVH73Aa-2ALrH4c_CbvQqTNbNRv9VjPZYv3aHJ4",
+    "brw": "brwtN7N3OgPFrtfb2",
+    "brwConsent": "opt-in",
+    "acq": "eyJhY3F1aXNpdGlvbiI6Ilt7XCJwbGF0Zm9ybVwiOlwiZGVza3RvcFwiLFwib3JpZ2luXCI6XCJsb2dpblwiLFwidG91Y2hUaW1lXCI6XCIyMDI0LTEyLTEyVDE3OjU1OjQyLjU3OVpcIn1dIn0=",
+    "acq.sig": "5xrqXjip4IJZxIeSPCkajWt_wlBmGw-k7HJCj8wicxU",
+    "AWSALBTGCORS": "YoIaU+wibkMfutpYUIlGnvYmnUa0VjM2ukwIhESaxfQUNL+PkCcRm5MIXVI5Q+dNJn7rAfdvTlrSF8XXU7wIWQqg8DQn2+OmvFeR5uzreWH5QaRIodTZ5gVQpXK1A62oDSR18fgyIOBRza2wIiet/67JgimPxGpuecdbz2oUwr7UqifGVz0="
+}
+request_id = "req4q4tKw3woEEWxw&"
+share_id = "shrQBuWjXd0YgPqV6"
+application_id = "appwewqLk7iUY4azc"
+signature = "be8bd40c133f051f929ebab311c416013f5af0d5acae4264575b88ccf051ee59"
+def get_access_policy(view_id: str) -> dict[str, str]:
+    return {
+        "allowedActions": [
+            {
+                "modelClassName": "view",
+                "modelIdSelector": view_id,
+                "action": "readSharedViewData"
+            },
+            {
+                "modelClassName": "view",
+                "modelIdSelector": view_id,
+                "action": "getMetadataForPrinting"
+            },
+            {
+                "modelClassName": "view",
+                "modelIdSelector": view_id,
+                "action": "readSignedAttachmentUrls"
+            },
+            {
+                "modelClassName": "row",
+                "modelIdSelector": f"rows *[displayedInView={view_id}]",
+                "action": "createDocumentPreviewSession"
+            }
+        ],
+        "shareId": share_id,
+        "applicationId": application_id,
+        "generationNumber": 0,
+        # "expires": "2025-01-02T00:00:00.000Z",  # todo:: check how to set it
+        "signature": signature
+    }
+stringifiedObjectParams = {"shouldUseNestedResponseFormat": "true"}
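The example URL in the test module below shows how these constants are expected to reach Airtable: the object params and access policy are JSON-encoded and URL-escaped into query parameters of the readSharedViewData endpoint. A small sketch of that encoding (an assumption drawn from the example URL, not code in this commit; the constants module path and the requestId key name are taken from the imports and the URL respectively):

    import json
    from urllib.parse import urlencode

    from jobspy.scrapers.goozali.constants import (  # assumed module path
        get_access_policy,
        request_id,
        stringifiedObjectParams,
        view_ids,
    )

    view_id = view_ids[0]
    query = urlencode({
        "stringifiedObjectParams": json.dumps(stringifiedObjectParams),
        "requestId": request_id,  # key name as it appears in the example URL
        "accessPolicy": json.dumps(get_access_policy(view_id)),
    })
    print(f"https://airtable.com/v0.3/view/{view_id}/readSharedViewData?{query}")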

View File

@@ -0,0 +1,10 @@
+from enum import Enum
+class Site(Enum):
+    LINKEDIN = "linkedin"
+    INDEED = "indeed"
+    ZIP_RECRUITER = "zip_recruiter"
+    GLASSDOOR = "glassdoor"
+    GOOGLE = "google"
+    GOOZALI = "goozali"
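Because Site is a plain Enum, members can be resolved from user-supplied strings either by name or by value, which is useful when site names arrive as strings (as the scrape_jobs signature allows). A small illustration, not part of the commit:

    from jobspy.scrapers.site import Site

    assert Site["GOOZALI"] is Site.GOOZALI  # lookup by member name
    assert Site("goozali") is Site.GOOZALI  # lookup by value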

View File

@@ -7,6 +7,9 @@ from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoic
 from jobspy.scrapers.goozali.model.GoozaliRow import GoozaliRow
 from jobspy.scrapers.goozali.model.GozaaliResponseData import GoozaliResponseData
+# URL Example
+# https://airtable.com/v0.3/view/viwagEIbkfz2iMsLU/readSharedViewData?stringifiedObjectParams=%7B%22shouldUseNestedResponseFormat%22%3Atrue%7D&requestId=reqXyRSHWlXyiRgY9&accessPolicy=%7B%22allowedActions%22%3A%5B%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSharedViewData%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22getMetadataForPrinting%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSignedAttachmentUrls%22%7D%2C%7B%22modelClassName%22%3A%22row%22%2C%22modelIdSelector%22%3A%22rows%20*%5BdisplayedInView%3DviwagEIbkfz2iMsLU%5D%22%2C%22action%22%3A%22createDocumentPreviewSession%22%7D%5D%2C%22shareId%22%3A%22shr97tl6luEk4Ca9R%22%2C%22applicationId%22%3A%22app5sYJyDgcRbJWYU%22%2C%22generationNumber%22%3A0%2C%22expires%22%3A%222025-01-02T00%3A00%3A00.000Z%22%2C%22signature%22%3A%223aa292ee44d15aa75d9506200329e413653471f89e000fa370ef9fa38393070a%22%7D
 def test_goozali():
     result = scrape_jobs(