mapped to job post object

fix the location by goozali ids
pull/231/head
Yariv Menachem 2024-12-18 16:28:55 +02:00
parent ead8eb126d
commit f2d5bb6cfa
5 changed files with 61 additions and 66 deletions

View File

@ -185,6 +185,7 @@ class Location(BaseModel):
country: Country | str | None = None country: Country | str | None = None
city: Optional[str] = None city: Optional[str] = None
state: Optional[str] = None state: Optional[str] = None
text: str = None
def display_location(self) -> str: def display_location(self) -> str:
location_parts = [] location_parts = []
@ -253,6 +254,12 @@ class DescriptionFormat(Enum):
class JobPost(BaseModel): class JobPost(BaseModel):
# def __init__(self, obj):
# super().__init__()
# for key, value in obj.items():
# setattr(self, key, value)
id: str | None = None id: str | None = None
title: str title: str
company_name: str | None company_name: str | None
@ -271,6 +278,7 @@ class JobPost(BaseModel):
emails: list[str] | None = None emails: list[str] | None = None
is_remote: bool | None = None is_remote: bool | None = None
listing_type: str | None = None listing_type: str | None = None
field: str | None = None
# linkedin specific # linkedin specific
job_level: str | None = None job_level: str | None = None

View File

@ -1,10 +1,12 @@
from datetime import datetime
import json import json
from jobspy.jobs import JobPost from jobspy.jobs import JobPost, Location
from jobspy.scrapers.goozali.model import GoozaliColumnTypeOptions, GoozaliResponse, GoozaliRow from jobspy.scrapers.goozali.model import GoozaliColumnTypeOptions, GoozaliResponse, GoozaliRow
from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
from jobspy.scrapers.goozali.model.GozaaliResponseData import GoozaliResponseData from jobspy.scrapers.goozali.model.GozaaliResponseData import GoozaliResponseData
from .constants import job_post_column_to_goozali_column, job_post_column_names
# Mapping function to convert parsed dictionary into GoozaliResponseData # Mapping function to convert parsed dictionary into GoozaliResponseData
@ -77,6 +79,24 @@ class GoozaliMapper:
# Return a new GoozaliResponse with msg and the converted data # Return a new GoozaliResponse with msg and the converted data
return GoozaliResponse(msg=data['msg'], data=data_obj) return GoozaliResponse(msg=data['msg'], data=data_obj)
def map_goozali_response_to_job_post(self, row: GoozaliRow, columns: dict[str, GoozaliColumn]) -> JobPost: def get_value_by_job_post_Id(self, job_post_column: str, row: GoozaliRow, dict_column_name_to_columnZ):
goozali_column_name = job_post_column_to_goozali_column[job_post_column]
column = dict_column_name_to_columnZ[goozali_column_name]
value = row.cellValuesByColumnId[column.id]
if (job_post_column == "location"):
# todo: fix it
return Location(text="tel aviv")
return JobPost() if (job_post_column == "date_posted"):
return datetime.fromisoformat(value.replace("Z", "")).date()
return str(value)
def map_goozali_response_to_job_post(self, row: GoozaliRow, dict_column_name_to_column) -> JobPost:
    """Build a ``JobPost`` from a single Goozali row.

    Every name listed in ``job_post_column_names`` is resolved through
    ``get_value_by_job_post_Id`` and the collected values are validated
    into a ``JobPost`` via ``JobPost.model_validate``.

    Args:
        row: The Goozali row whose cell values are read.
        dict_column_name_to_column: Lookup passed through unchanged to
            ``get_value_by_job_post_Id``.

    Returns:
        The validated ``JobPost``.
    """
    job_post_fields = {
        column_name: self.get_value_by_job_post_Id(
            column_name, row, dict_column_name_to_column)
        for column_name in job_post_column_names
    }
    return JobPost.model_validate(job_post_fields)

View File

@ -18,7 +18,6 @@ from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoic
from jobspy.scrapers.site import Site from jobspy.scrapers.site import Site
from ..utils import create_session, create_logger from ..utils import create_session, create_logger
from .constants import headers
from ...jobs import ( from ...jobs import (
JobPost, JobPost,
JobResponse, JobResponse,

View File

@ -1,59 +1,21 @@
import json job_post_column_to_goozali_column = {
"date_posted": "Discovered",
"field": "Field",
view_ids = ["viwIOzPYaUGxlA0Jd"] "title": "Job Title",
"job_url": "Position Link",
headers = { "company_name": "Company",
'accept': '*/*', "description": "Requirements",
'accept-language': 'en-US,en;q=0.9,he-IL;q=0.8,he;q=0.7', "location": "Location",
'priority': 'u=1, i', "company_industry": "Company Industry",
'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"', "id": "Job ID"
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
'x-airtable-accept-msgpack': 'true',
'x-airtable-application-id': 'appwewqLk7iUY4azc',
'x-airtable-inter-service-client': 'webClient',
'x-airtable-page-load-id': 'pglqAAzFDZEWCEC7s',
'x-early-prefetch': 'true',
'x-requested-with': 'XMLHttpRequest',
'x-time-zone': 'Asia/Jerusalem',
'x-user-locale': 'en'
} }
session_id = "lWt/xRLIQas/blkys/2YBYl0priNI7gv85sXXtmkrW+TzbLHR8Vm6iY5RDialmLUYsQgLab8uWZyahWRw0HizxdOXhJxd5FB66H85GpUAX8zZbAZPZdUHvzxjaVa130w14QSXDa8OmsNlpKtiUtZ/DXMTOZ1wYDWC4tVJTKJ171wyKA7C9E=" job_post_column_names = ["id",
"date_posted",
cookies = {} "field",
"title",
request_id = "req4q4tKw3woEEWxw&" "job_url",
share_id = "shrQBuWjXd0YgPqV6" "company_name",
application_id = "appwewqLk7iUY4azc" "description",
signature = "be8bd40c133f051f929ebab311c416013f5af0d5acae4264575b88ccf051ee59" "location",
"company_industry"]
def get_access_policy(view_id: str) -> str:
    """Return the access policy for *view_id* serialized as a JSON string.

    The policy allows three "view" actions selected by *view_id* and one
    "row" action restricted to rows displayed in that view. It also embeds
    the module-level ``share_id``, ``application_id`` and ``signature``.

    Args:
        view_id: Identifier placed in every ``modelIdSelector`` entry.

    Returns:
        The policy encoded with ``json.dumps`` -- a ``str``, not a dict.
    """
    access_policy = {
        "allowedActions": [
            {"modelClassName": "view", "modelIdSelector": view_id,
             "action": "readSharedViewData"},
            {"modelClassName": "view", "modelIdSelector": view_id,
             "action": "getMetadataForPrinting"},
            {"modelClassName": "view", "modelIdSelector": view_id,
             "action": "readSignedAttachmentUrls"},
            {"modelClassName": "row", "modelIdSelector": f"rows *[displayedInView={view_id}]",
             "action": "createDocumentPreviewSession"}
        ],
        "shareId": share_id,
        "applicationId": application_id,
        "generationNumber": 0,
        # NOTE(review): the expiry is hard-coded; presumably it is covered
        # by `signature` -- confirm before changing either value.
        "expires": "2025-01-02T00:00:00.000Z",
        "signature": signature
    }
    # Convert to a JSON string
    return json.dumps(access_policy)
stringifiedObjectParams = {"shouldUseNestedResponseFormat": "true"}

View File

@ -3,6 +3,7 @@ import os
from jobspy import scrape_jobs from jobspy import scrape_jobs
import pandas as pd import pandas as pd
from jobspy.jobs import JobPost
from jobspy.scrapers.goozali.GoozaliMapper import GoozaliMapper from jobspy.scrapers.goozali.GoozaliMapper import GoozaliMapper
from jobspy.scrapers.goozali.GoozaliScrapperComponent import GoozaliScrapperComponent from jobspy.scrapers.goozali.GoozaliScrapperComponent import GoozaliScrapperComponent
from jobspy.scrapers.goozali.model import GoozaliColumn from jobspy.scrapers.goozali.model import GoozaliColumn
@ -35,13 +36,18 @@ try:
filtered_rows_by_age_and_column_choice = component.filter_rows_by_hours( filtered_rows_by_age_and_column_choice = component.filter_rows_by_hours(
filtered_rows_by_column_choice, hours_old) filtered_rows_by_column_choice, hours_old)
# Key mapper: Extract 'id' as the key # Key mapper: Extract 'name' as the key
def extract_goozali_column_id(column): return column.id if isinstance( def extract_goozali_column_name(column): return column.name if isinstance(
column, GoozaliColumn) else None column, GoozaliColumn) else None
dict_column_id_to_column = create_dict_by_key_and_value( dict_column_name_to_column = create_dict_by_key_and_value(
response_data.columns, extract_goozali_column_id) response_data.columns, extract_goozali_column_name)
response: list[JobPost] = []
for row in filtered_rows_by_age_and_column_choice:
job_post = mapper.map_goozali_response_to_job_post(
row, dict_column_name_to_column)
response.append(job_post)
print("hello heloo") print("kingggggg")
except FileNotFoundError: except FileNotFoundError:
print("The file was not found.") print("The file was not found.")
except json.JSONDecodeError: except json.JSONDecodeError: