Initial commit

Models are ready; the Goozali logic file is based on the LinkedIn scraper.
pull/231/head
Yariv Menachem 2024-12-12 17:22:58 +02:00
parent f0ea89b357
commit 3dc15195d5
13 changed files with 605 additions and 10 deletions

View File

@@ -1,16 +1,26 @@
 import asyncio
+from enum import Enum
 
 from db.job_repository import JobRepository
 from jobspy import scrape_jobs
 from jobspy.telegram_bot import TelegramBot
 
+
+class Site(Enum):
+    LINKEDIN = "linkedin"
+    GOOZALI = "goozali"
+    INDEED = "indeed"
+    ZIP_RECRUITER = "zip_recruiter"
+    GLASSDOOR = "glassdoor"
+    GOOGLE = "google"
+
 
 async def main():
     telegramBot = TelegramBot()
     jobRepository = JobRepository()
 
     jobs = scrape_jobs(
-        # site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google"],
-        site_name=["indeed"],
+        # site_name=[Site.LINKEDIN, Site.GOOZALI, Site.GLASSDOOR, Site.INDEED],
+        site_name=[Site.GOOZALI],
         search_term="software engineer",
         google_search_term="software engineer jobs near Tel Aviv Israel since yesterday",
         location="Central, Israel",

View File

@@ -2,6 +2,8 @@ from __future__ import annotations
 
 from abc import ABC, abstractmethod
 
+from jobspy.main import Site
+
 from ..jobs import (
     Enum,
     BaseModel,
@@ -12,14 +14,6 @@ from ..jobs import (
 )
 
 
-class Site(Enum):
-    LINKEDIN = "linkedin"
-    INDEED = "indeed"
-    ZIP_RECRUITER = "zip_recruiter"
-    GLASSDOOR = "glassdoor"
-    GOOGLE = "google"
-
-
 class SalarySource(Enum):
     DIRECT_DATA = "direct_data"
     DESCRIPTION = "description"
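Note that this hunk has jobspy.scrapers import Site from jobspy.main, while the package entry point imports the scrapers in turn, which invites a circular import. A minimal sketch of one conventional way out, assuming a new dependency-free module (the site_type.py name is hypothetical):

# jobspy/site_type.py (hypothetical module) -- both jobspy.main and
# jobspy.scrapers could import Site from here without importing each other.
from enum import Enum

class Site(Enum):
    LINKEDIN = "linkedin"
    GOOZALI = "goozali"
    INDEED = "indeed"
    ZIP_RECRUITER = "zip_recruiter"
    GLASSDOOR = "glassdoor"
    GOOGLE = "google"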

View File

@@ -0,0 +1,439 @@
"""
jobspy.scrapers.Goozali
~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape Goozali.
"""
from __future__ import annotations
import math
import time
import random
import regex as re
from typing import Optional
from datetime import datetime
from bs4.element import Tag
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urlunparse, unquote
from requests.exceptions import RetryError, RequestException
from urllib3.exceptions import MaxRetryError
from .constants import headers
from .. import Scraper, ScraperInput, Site
from ..exceptions import GoozaliException
from ..utils import create_session, remove_attributes, create_logger
from ...jobs import (
JobPost,
Location,
JobResponse,
JobType,
Country,
Compensation,
DescriptionFormat,
)
from ..utils import (
extract_emails_from_text,
get_enum_from_job_type,
currency_parser,
markdown_converter,
)
logger = create_logger("Goozali")
class GoozaliScraper(Scraper):
base_url = "https://www.Goozali.com"
delay = 3
band_delay = 4
jobs_per_page = 25
def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
):
"""
Initializes GoozaliScraper with the Goozali job search url
"""
super().__init__(Site.GOOZALI, proxies=proxies, ca_cert=ca_cert)
self.session = create_session(
proxies=self.proxies,
ca_cert=ca_cert,
is_tls=False,
has_retry=True,
delay=5,
clear_cookies=True,
)
self.session.headers.update(headers)
self.scraper_input = None
self.country = "worldwide"
self.job_url_direct_regex = re.compile(r'(?<=\?url=)[^"]+')
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
Scrapes Goozali for jobs with scraper_input criteria
:param scraper_input:
:return: job_response
"""
self.scraper_input = scraper_input
job_list: list[JobPost] = []
seen_ids = set()
start = scraper_input.offset // 10 * 10 if scraper_input.offset else 0
request_count = 0
seconds_old = (
scraper_input.hours_old * 3600 if scraper_input.hours_old else None
)
continue_search = (
    lambda: len(job_list) < scraper_input.results_wanted and start < 1000
)
for location in scraper_input.locations:
logger.info(f"start searching for location: {location}")
while continue_search():
request_count += 1
logger.info(
    f"search page: {request_count} / {math.ceil(scraper_input.results_wanted / 10)}"
)
params = {
"keywords": scraper_input.search_term,
"location": location,
"distance": scraper_input.distance,
"f_WT": 2 if scraper_input.is_remote else None,
"f_JT": (
self.job_type_code(scraper_input.job_type)
if scraper_input.job_type
else None
),
"pageNum": 0,
"start": start,
"f_AL": "true" if scraper_input.easy_apply else None,
"f_C": (
",".join(map(str, scraper_input.Goozali_company_ids))
if scraper_input.Goozali_company_ids
else None
),
}
if seconds_old is not None:
params["f_TPR"] = f"r{seconds_old}"
params = {k: v for k, v in params.items() if v is not None}
try:
response = self.session.get(
f"{self.base_url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
params=params,
timeout=10,
)
if response.status_code not in range(200, 400):
if response.status_code == 429:
    err = "429 Response - Blocked by Goozali for too many requests"
else:
    err = f"Goozali response status code {response.status_code}"
err += f" - {response.text}"
logger.error(err)
return JobResponse(jobs=job_list)
except MaxRetryError as e:
"""Raised when the maximum number of retries is exceeded."""
logger.error(f"RetryError: {str(e)}")
logger.error(f"MaxRetryError for location: {location}")
break
except RetryError as e:
"""Custom retries logic failed"""
logger.error(f"RetryError: {str(e)}")
logger.error(f"RetryError for location: {location}")
break
except Exception as e:
if "Proxy responded with" in str(e):
logger.error(f"Goozali: Bad proxy")
else:
logger.error(f"Goozali: {str(e)}")
return JobResponse(jobs=job_list)
soup = BeautifulSoup(response.text, "html.parser")
job_cards = soup.find_all("div", class_="base-search-card")
if len(job_cards) == 0:
break
for job_card in job_cards:
href_tag = job_card.find("a", class_="base-card__full-link")
if href_tag and "href" in href_tag.attrs:
href = href_tag.attrs["href"].split("?")[0]
job_id = href.split("-")[-1]
if job_id in seen_ids:
continue
seen_ids.add(job_id)
try:
fetch_desc = scraper_input.Goozali_fetch_description
job_post = self._process_job(job_card, job_id, fetch_desc)
if job_post:
job_list.append(job_post)
if not continue_search():
break
except Exception as e:
raise GoozaliException(str(e))
if continue_search():
time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
start += len(job_list)
job_list = job_list[: scraper_input.results_wanted]
return JobResponse(jobs=job_list)
def _process_job(
self, job_card: Tag, job_id: str, full_descr: bool
) -> Optional[JobPost]:
salary_tag = job_card.find("span", class_="job-search-card__salary-info")
compensation = None
if salary_tag:
salary_text = salary_tag.get_text(separator=" ").strip()
salary_values = [currency_parser(value) for value in salary_text.split("-")]
salary_min = salary_values[0]
salary_max = salary_values[1]
currency = salary_text[0] if salary_text[0] != "$" else "USD"
compensation = Compensation(
min_amount=int(salary_min),
max_amount=int(salary_max),
currency=currency,
)
title_tag = job_card.find("span", class_="sr-only")
title = title_tag.get_text(strip=True) if title_tag else "N/A"
company_tag = job_card.find("h4", class_="base-search-card__subtitle")
company_a_tag = company_tag.find("a") if company_tag else None
company_url = (
urlunparse(urlparse(company_a_tag.get("href"))._replace(query=""))
if company_a_tag and company_a_tag.has_attr("href")
else ""
)
company = company_a_tag.get_text(strip=True) if company_a_tag else "N/A"
metadata_card = job_card.find("div", class_="base-search-card__metadata")
location = self._get_location(metadata_card)
datetime_tag = (
metadata_card.find("time", class_="job-search-card__listdate")
if metadata_card
else None
)
date_posted = None
if datetime_tag and "datetime" in datetime_tag.attrs:
datetime_str = datetime_tag["datetime"]
try:
date_posted = datetime.strptime(datetime_str, "%Y-%m-%d")
except ValueError:
date_posted = None
job_details = {}
if full_descr:
job_details = self._get_job_details(job_id)
return JobPost(
id=f"li-{job_id}",
title=title,
company_name=company,
company_url=company_url,
location=location,
date_posted=date_posted,
datetime_posted=date_posted,
job_url=f"{self.base_url}/jobs/view/{job_id}",
compensation=compensation,
job_type=job_details.get("job_type"),
job_level=(job_details.get("job_level") or "").lower(),
company_industry=job_details.get("company_industry"),
description=job_details.get("description"),
job_url_direct=job_details.get("job_url_direct"),
emails=extract_emails_from_text(job_details.get("description")),
company_logo=job_details.get("company_logo"),
job_function=job_details.get("job_function"),
)
def _get_job_details(self, job_id: str) -> dict:
"""
Retrieves job description and other job details by going to the job page url
:param job_page_url:
:return: dict
"""
try:
response = self.session.get(
f"{self.base_url}/jobs/view/{job_id}", timeout=5
)
response.raise_for_status()
except Exception:
return {}
if "Goozali.com/signup" in response.url:
return {}
soup = BeautifulSoup(response.text, "html.parser")
div_content = soup.find(
"div", class_=lambda x: x and "show-more-less-html__markup" in x
)
description = None
if div_content is not None:
div_content = remove_attributes(div_content)
description = div_content.prettify(formatter="html")
if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
description = markdown_converter(description)
h3_tag = soup.find(
"h3", text=lambda text: text and "Job function" in text.strip()
)
job_function = None
if h3_tag:
job_function_span = h3_tag.find_next(
"span", class_="description__job-criteria-text"
)
if job_function_span:
job_function = job_function_span.text.strip()
company_logo = (
logo_image.get("data-delayed-url")
if (logo_image := soup.find("img", {"class": "artdeco-entity-image"}))
else None
)
return {
"description": description,
"job_level": self._parse_job_level(soup),
"company_industry": self._parse_company_industry(soup),
"job_type": self._parse_job_type(soup),
"job_url_direct": self._parse_job_url_direct(soup),
"company_logo": company_logo,
"job_function": job_function,
}
def _get_location(self, metadata_card: Optional[Tag]) -> Location:
"""
Extracts the location data from the job metadata card.
:param metadata_card
:return: location
"""
location = Location(country=Country.from_string(self.country))
if metadata_card is not None:
location_tag = metadata_card.find(
"span", class_="job-search-card__location"
)
location_string = location_tag.text.strip() if location_tag else "N/A"
parts = location_string.split(", ")
if len(parts) == 2:
city, state = parts
location = Location(
city=city,
state=state,
country=Country.from_string(self.country),
)
elif len(parts) == 3:
city, state, country = parts
country = Country.from_string(country)
location = Location(city=city, state=state, country=country)
return location
@staticmethod
def _parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None:
"""
Gets the job type from job page
:param soup_job_type:
:return: JobType
"""
h3_tag = soup_job_type.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Employment type" in text,
)
employment_type = None
if h3_tag:
employment_type_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if employment_type_span:
employment_type = employment_type_span.get_text(strip=True)
employment_type = employment_type.lower()
employment_type = employment_type.replace("-", "")
return [get_enum_from_job_type(employment_type)] if employment_type else []
@staticmethod
def _parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
"""
Gets the job level from job page
:param soup_job_level:
:return: str
"""
h3_tag = soup_job_level.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Seniority level" in text,
)
job_level = None
if h3_tag:
job_level_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if job_level_span:
job_level = job_level_span.get_text(strip=True)
return job_level
@staticmethod
def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
"""
Gets the company industry from job page
:param soup_industry:
:return: str
"""
h3_tag = soup_industry.find(
"h3",
class_="description__job-criteria-subheader",
string=lambda text: "Industries" in text,
)
industry = None
if h3_tag:
industry_span = h3_tag.find_next_sibling(
"span",
class_="description__job-criteria-text description__job-criteria-text--criteria",
)
if industry_span:
industry = industry_span.get_text(strip=True)
return industry
def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
"""
Gets the job url direct from job page
:param soup:
:return: str
"""
job_url_direct = None
job_url_direct_content = soup.find("code", id="applyUrl")
if job_url_direct_content:
job_url_direct_match = self.job_url_direct_regex.search(
job_url_direct_content.decode_contents().strip()
)
if job_url_direct_match:
job_url_direct = unquote(job_url_direct_match.group())
return job_url_direct
@staticmethod
def job_type_code(job_type_enum: JobType) -> str:
return {
JobType.FULL_TIME: "F",
JobType.PART_TIME: "P",
JobType.INTERNSHIP: "I",
JobType.CONTRACT: "C",
JobType.TEMPORARY: "T",
}.get(job_type_enum, "")
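For readers who want to drive the scraper directly rather than through scrape_jobs, a minimal sketch; the ScraperInput field names are taken from how scrape() reads them above (search_term, locations, results_wanted), and any constructor details beyond that are assumptions:

# Hypothetical direct usage of GoozaliScraper, bypassing scrape_jobs;
# ScraperInput and Site come from the package imports at the top of this file.
scraper = GoozaliScraper()
scraper_input = ScraperInput(
    site_type=[Site.GOOZALI],       # assumption: field name in ScraperInput
    search_term="software engineer",
    locations=["Central, Israel"],  # scrape() iterates scraper_input.locations
    results_wanted=25,
)
response = scraper.scrape(scraper_input)
for job in response.jobs:
    print(job.title, job.company_name, job.job_url)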

View File

@@ -0,0 +1,8 @@
headers = {
"authority": "www.linkedin.com",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-language": "en-US,en;q=0.9",
"cache-control": "max-age=0",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
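Since this constants file is carried over from the LinkedIn scraper, the authority header still names www.linkedin.com even though GoozaliScraper.base_url points at www.Goozali.com. A hedged sketch of the adjustment, assuming Goozali is in fact served from that host:

# Hypothetical override: align the authority with GoozaliScraper.base_url,
# reusing the accept/user-agent values defined above unchanged.
headers = {
    **headers,
    "authority": "www.goozali.com",  # assumption: lowercase host of base_url
}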

View File

@@ -0,0 +1,20 @@
from typing import Optional
from jobspy.scrapers.goozali.model import GoozaliColumnTypeOptions
class GoozaliColumn:
def __init__(self, id: str, name: str, description: Optional[str], type: str, typeOptions: GoozaliColumnTypeOptions,
default: Optional[str], initialCreatedTime: str, initialCreatedByUserId: str,
lastModifiedTime: str, lastModifiedByUserId: str, isEditableFromSync: bool):
self.id = id
self.name = name
self.description = description
self.type = type
self.typeOptions = typeOptions
self.default = default
self.initialCreatedTime = initialCreatedTime
self.initialCreatedByUserId = initialCreatedByUserId
self.lastModifiedTime = lastModifiedTime
self.lastModifiedByUserId = lastModifiedByUserId
self.isEditableFromSync = isEditableFromSync
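The model classes in this commit are hand-written __init__ wrappers; a dataclass yields the same constructor with less code and is the idiomatic form. A sketch of GoozaliColumn rewritten that way (the same pattern would apply to the other model classes below):

from dataclasses import dataclass
from typing import Optional

from jobspy.scrapers.goozali.model import GoozaliColumnTypeOptions

@dataclass
class GoozaliColumn:
    # Each field replaces one hand-written self.x = x assignment above.
    id: str
    name: str
    description: Optional[str]
    type: str
    typeOptions: GoozaliColumnTypeOptions
    default: Optional[str]
    initialCreatedTime: str
    initialCreatedByUserId: str
    lastModifiedTime: str
    lastModifiedByUserId: str
    isEditableFromSync: bool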

View File

@@ -0,0 +1,8 @@
from typing import Optional
class GoozaliColumnChoice:
def __init__(self, id: str, name: str, color: Optional[str] = None):
self.id = id
self.name = name
self.color = color

View File

@@ -0,0 +1,10 @@
from typing import Dict, List
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
class GoozaliColumnTypeOptions:
def __init__(self, choiceOrder: List[str], choices: Dict[str, GoozaliColumnChoice], disableColors: bool):
self.choiceOrder = choiceOrder
self.choices = choices
self.disableColors = disableColors

View File

@@ -0,0 +1,7 @@
from jobspy.scrapers.goozali.model.GoozaliResponseData import GoozaliResponseData
class GoozaliResponse:
def __init__(self, msg: str, data: GoozaliResponseData):
self.msg = msg
self.data = data

View File

@@ -0,0 +1,8 @@
from typing import Dict, List
class GoozaliRow:
def __init__(self, id: str, createdTime: str, cellValuesByColumnId: Dict[str, List[str]]):
self.id = id
self.createdTime = createdTime
self.cellValuesByColumnId = cellValuesByColumnId

View File

@@ -0,0 +1,17 @@
from typing import Dict, List
from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
from jobspy.scrapers.goozali.model.GoozaliRow import GoozaliRow
class GoozaliTable:
def __init__(self, applicationId: str, id: str, name: str, columns: List[GoozaliColumn], primaryColumnId: str,
meaningfulColumnOrder: List[Dict[str, str]], viewOrder: List[str], rows: List[GoozaliRow]):
self.applicationId = applicationId
self.id = id
self.name = name
self.columns = columns
self.primaryColumnId = primaryColumnId
self.meaningfulColumnOrder = meaningfulColumnOrder
self.viewOrder = viewOrder
self.rows = rows

View File

@@ -0,0 +1,6 @@
from jobspy.scrapers.goozali.model import GoozaliTable
class GoozaliResponseData:
def __init__(self, table: GoozaliTable):
self.table = table
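Nothing in the commit yet maps a raw Goozali payload onto these models. A minimal parsing sketch, assuming the endpoint returns JSON shaped like {"msg": ..., "data": {"table": ...}} as the field names suggest; the helper name and the .get defaults are hypothetical, and import paths follow the module-per-class layout above:

from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
from jobspy.scrapers.goozali.model.GoozaliColumnTypeOptions import GoozaliColumnTypeOptions
from jobspy.scrapers.goozali.model.GoozaliRow import GoozaliRow
from jobspy.scrapers.goozali.model.GoozaliTable import GoozaliTable
from jobspy.scrapers.goozali.model.GoozaliResponse import GoozaliResponse
from jobspy.scrapers.goozali.model.GoozaliResponseData import GoozaliResponseData

def goozali_response_from_json(payload: dict) -> GoozaliResponse:
    # Hypothetical helper: field names mirror the constructors above; the
    # actual payload shape is an assumption, not confirmed by this commit.
    t = payload["data"]["table"]
    columns = []
    for c in t["columns"]:
        opts = c.get("typeOptions") or {}
        type_options = GoozaliColumnTypeOptions(
            choiceOrder=opts.get("choiceOrder", []),
            choices={
                cid: GoozaliColumnChoice(id=ch["id"], name=ch["name"], color=ch.get("color"))
                for cid, ch in opts.get("choices", {}).items()
            },
            disableColors=opts.get("disableColors", False),
        )
        columns.append(GoozaliColumn(
            id=c["id"], name=c["name"], description=c.get("description"),
            type=c["type"], typeOptions=type_options, default=c.get("default"),
            initialCreatedTime=c.get("initialCreatedTime", ""),
            initialCreatedByUserId=c.get("initialCreatedByUserId", ""),
            lastModifiedTime=c.get("lastModifiedTime", ""),
            lastModifiedByUserId=c.get("lastModifiedByUserId", ""),
            isEditableFromSync=c.get("isEditableFromSync", False),
        ))
    rows = [
        GoozaliRow(id=r["id"], createdTime=r["createdTime"],
                   cellValuesByColumnId=r.get("cellValuesByColumnId", {}))
        for r in t["rows"]
    ]
    table = GoozaliTable(
        applicationId=t["applicationId"], id=t["id"], name=t["name"],
        columns=columns, primaryColumnId=t["primaryColumnId"],
        meaningfulColumnOrder=t.get("meaningfulColumnOrder", []),
        viewOrder=t.get("viewOrder", []), rows=rows,
    )
    return GoozaliResponse(msg=payload["msg"], data=GoozaliResponseData(table=table))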

src/tests/test_goozali.py Normal file
View File

@@ -0,0 +1,68 @@
from jobspy import scrape_jobs
import pandas as pd
from jobspy.scrapers.goozali.model import GoozaliColumnTypeOptions, GoozaliResponse, GoozaliTable
from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
from jobspy.scrapers.goozali.model.GoozaliRow import GoozaliRow
from jobspy.scrapers.goozali.model.GoozaliResponseData import GoozaliResponseData
def test_goozali():
    # NOTE: this exercises scrape_jobs against glassdoor; the Goozali path
    # itself is not scraped here yet.
    result = scrape_jobs(
site_name="glassdoor",
search_term="engineer",
results_wanted=5,
)
assert (
isinstance(result, pd.DataFrame) and len(result) == 5
), "Result should be a non-empty DataFrame"
def createMockGoozaliResponse() -> GoozaliResponse:
data = GoozaliResponseData(table=GoozaliTable(
applicationId="app7OQjqEzTtCRq7u",
id="tblBQjp5Aw6O172VY",
name="Shared view table",
columns=[
GoozaliColumn(
id="fldIf9DbRpNRLJXuD",
name="Industry",
description=None,
type="multiSelect",
typeOptions=GoozaliColumnTypeOptions(
choiceOrder=["selcE6QUv4vWIIcZR", "sel0JIQKMmz3jCFUN", "selzhpwlfPssG4OEx"],
choices={
"selwhDNBom2dZJkgv": GoozaliColumnChoice(id="selwhDNBom2dZJkgv", name="HealthTech", color="orange"),
"selReHesNOVD3PvCo": GoozaliColumnChoice(id="selReHesNOVD3PvCo", name="Automotive", color="pink")
},
disableColors=False
),
default=None,
initialCreatedTime="2022-12-29T10:23:21.000Z",
initialCreatedByUserId="usr1fVy2RIyCuGHec",
lastModifiedTime="2024-07-21T09:30:02.000Z",
lastModifiedByUserId="usr1fVy2RIyCuGHec",
isEditableFromSync=False
)
],
primaryColumnId="fldLT11B0cpV6p9Uz",
meaningfulColumnOrder=[
{"columnId": "fldLT11B0cpV6p9Uz", "visibility": True},
{"columnId": "fldIf9DbRpNRLJXuD", "visibility": True, "width": 368},
{"columnId": "fldOLt34j8Pm2dcCq", "visibility": True, "width": 182}
],
viewOrder=["viwNRSqqmqZLP0a3C"],
rows=[
GoozaliRow(
id="recwiKgHT9mJrqoxa",
createdTime="2023-01-09T10:32:09.000Z",
cellValuesByColumnId={
"fldLT11B0cpV6p9Uz": ["3M"],
"fldIf9DbRpNRLJXuD": ["selwhDNBom2dZJkgv", "selReHesNOVD3PvCo"]
}
)
]
))
return GoozaliResponse(msg="SUCCESS", data=data)
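createMockGoozaliResponse is built but not yet consumed; a sketch of a test that exercises it, with illustrative assertions derived from the mock data above:

def test_mock_goozali_response_shape():
    # Illustrative test: checks the mock round-trips the values set above.
    response = createMockGoozaliResponse()
    assert response.msg == "SUCCESS"
    table = response.data.table
    assert table.name == "Shared view table"
    assert table.columns[0].type == "multiSelect"
    assert "selwhDNBom2dZJkgv" in table.columns[0].typeOptions.choices
    assert table.rows[0].cellValuesByColumnId["fldLT11B0cpV6p9Uz"] == ["3M"]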