mapped to job post object

fix the location by goozali ids
pull/231/head
Yariv Menachem 2024-12-18 16:28:55 +02:00
parent ead8eb126d
commit f2d5bb6cfa
5 changed files with 61 additions and 66 deletions

View File

@ -185,6 +185,7 @@ class Location(BaseModel):
country: Country | str | None = None
city: Optional[str] = None
state: Optional[str] = None
text: str = None
def display_location(self) -> str:
location_parts = []
@ -253,6 +254,12 @@ class DescriptionFormat(Enum):
class JobPost(BaseModel):
# def __init__(self, obj):
# super().__init__()
# for key, value in obj.items():
# setattr(self, key, value)
id: str | None = None
title: str
company_name: str | None
@ -271,6 +278,7 @@ class JobPost(BaseModel):
emails: list[str] | None = None
is_remote: bool | None = None
listing_type: str | None = None
field: str | None = None
# linkedin specific
job_level: str | None = None

View File

@ -1,10 +1,12 @@
from datetime import datetime
import json
from jobspy.jobs import JobPost
from jobspy.jobs import JobPost, Location
from jobspy.scrapers.goozali.model import GoozaliColumnTypeOptions, GoozaliResponse, GoozaliRow
from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
from jobspy.scrapers.goozali.model.GozaaliResponseData import GoozaliResponseData
from .constants import job_post_column_to_goozali_column, job_post_column_names
# Mapping function to convert parsed dictionary into GoozaliResponseData
@ -77,6 +79,24 @@ class GoozaliMapper:
# Return a new GoozaliResponse with msg and the converted data
return GoozaliResponse(msg=data['msg'], data=data_obj)
def map_goozali_response_to_job_post(self, row: GoozaliRow, columns: dict[str, GoozaliColumn]) -> JobPost:
def get_value_by_job_post_Id(self, job_post_column: str, row: GoozaliRow, dict_column_name_to_columnZ):
    """Return the value of one ``JobPost`` field extracted from a Goozali row.

    The job-post field name is translated to a Goozali column name through
    ``job_post_column_to_goozali_column``; the id of that column is then used
    to read the cell from ``row.cellValuesByColumnId``.

    Args:
        job_post_column: ``JobPost`` field name, e.g. ``"title"``.
        row: Goozali row whose cells are keyed by column id.
        dict_column_name_to_columnZ: Mapping of Goozali column name to its
            column object (only ``.id`` is read here).

    Returns:
        A ``Location`` for ``"location"``, a ``date`` for ``"date_posted"``,
        ``None`` when the row has no cell for the column, otherwise the cell
        value converted with ``str``.

    Raises:
        KeyError: If ``job_post_column`` has no Goozali mapping, or the mapped
            column name is missing from ``dict_column_name_to_columnZ``.
    """
    goozali_column_name = job_post_column_to_goozali_column[job_post_column]
    column = dict_column_name_to_columnZ[goozali_column_name]

    if job_post_column == "location":
        # TODO: resolve the real location from the row's cell value; the
        # placeholder below ignores the cell entirely.
        return Location(text="tel aviv")

    try:
        value = row.cellValuesByColumnId[column.id]
    except KeyError:
        # The row carries no cell for this column (presumably because it is
        # empty in the source table - verify). Report it as missing instead
        # of aborting the mapping of the whole row.
        return None

    if job_post_column == "date_posted":
        # Drop the "Z" designator before parsing; only the calendar date of
        # the timestamp is kept.
        return datetime.fromisoformat(value.replace("Z", "")).date()
    return str(value)
def map_goozali_response_to_job_post(self, row: GoozaliRow, dict_column_name_to_column) -> JobPost:
    """Build a ``JobPost`` from a single Goozali row.

    Every name in ``job_post_column_names`` is resolved through
    ``get_value_by_job_post_Id`` and the collected values are validated
    into a ``JobPost`` with ``JobPost.model_validate``.

    Args:
        row: Goozali row to convert.
        dict_column_name_to_column: Mapping of Goozali column name to its
            column object, passed through to ``get_value_by_job_post_Id``.

    Returns:
        The validated ``JobPost``.
    """
    field_values = {
        column_name: self.get_value_by_job_post_Id(
            column_name, row, dict_column_name_to_column)
        for column_name in job_post_column_names
    }
    return JobPost.model_validate(field_values)

View File

@ -18,7 +18,6 @@ from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoic
from jobspy.scrapers.site import Site
from ..utils import create_session, create_logger
from .constants import headers
from ...jobs import (
JobPost,
JobResponse,

View File

@ -1,59 +1,21 @@
import json
view_ids = ["viwIOzPYaUGxlA0Jd"]
headers = {
'accept': '*/*',
'accept-language': 'en-US,en;q=0.9,he-IL;q=0.8,he;q=0.7',
'priority': 'u=1, i',
'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
'x-airtable-accept-msgpack': 'true',
'x-airtable-application-id': 'appwewqLk7iUY4azc',
'x-airtable-inter-service-client': 'webClient',
'x-airtable-page-load-id': 'pglqAAzFDZEWCEC7s',
'x-early-prefetch': 'true',
'x-requested-with': 'XMLHttpRequest',
'x-time-zone': 'Asia/Jerusalem',
'x-user-locale': 'en'
# JobPost field name -> name of the Goozali column that supplies its value.
job_post_column_to_goozali_column = {
    "date_posted": "Discovered",
    "field": "Field",
    "title": "Job Title",
    "job_url": "Position Link",
    "company_name": "Company",
    "description": "Requirements",
    "location": "Location",
    "company_industry": "Company Industry",
    "id": "Job ID",
}
# Hard-coded request/session constants. share_id, application_id and
# signature are embedded in the policy built by get_access_policy.
# NOTE(review): session_id and signature look like values captured from a
# live browser session - confirm they are safe to keep in source control and
# how they are meant to be refreshed.
session_id = "lWt/xRLIQas/blkys/2YBYl0priNI7gv85sXXtmkrW+TzbLHR8Vm6iY5RDialmLUYsQgLab8uWZyahWRw0HizxdOXhJxd5FB66H85GpUAX8zZbAZPZdUHvzxjaVa130w14QSXDa8OmsNlpKtiUtZ/DXMTOZ1wYDWC4tVJTKJ171wyKA7C9E="
cookies = {}
# NOTE(review): the trailing "&" is presumably left over from a copied query
# string - verify before relying on this value.
request_id = "req4q4tKw3woEEWxw&"
share_id = "shrQBuWjXd0YgPqV6"
application_id = "appwewqLk7iUY4azc"
signature = "be8bd40c133f051f929ebab311c416013f5af0d5acae4264575b88ccf051ee59"
def get_access_policy(view_id: str) -> str:
    """Build the JSON-encoded access policy for a shared view.

    Args:
        view_id: Id of the view the allowed actions are scoped to.

    Returns:
        The policy serialized with ``json.dumps`` - a JSON string, not a
        dict.
    """
    # Actions granted on the view itself; all share the same selector.
    view_actions = (
        "readSharedViewData",
        "getMetadataForPrinting",
        "readSignedAttachmentUrls",
    )
    allowed_actions = [
        {"modelClassName": "view", "modelIdSelector": view_id,
         "action": action}
        for action in view_actions
    ]
    # Row-level action, scoped to the rows displayed in the view.
    allowed_actions.append(
        {"modelClassName": "row",
         "modelIdSelector": f"rows *[displayedInView={view_id}]",
         "action": "createDocumentPreviewSession"}
    )

    access_policy = {
        "allowedActions": allowed_actions,
        "shareId": share_id,
        "applicationId": application_id,
        "generationNumber": 0,
        # NOTE(review): fixed expiry timestamp; presumably it has to match
        # the hard-coded signature - confirm before changing either.
        "expires": "2025-01-02T00:00:00.000Z",
        "signature": signature,
    }
    # Convert to a JSON string
    return json.dumps(access_policy)
stringifiedObjectParams = {"shouldUseNestedResponseFormat": "true"}
# JobPost fields that get populated from a Goozali row, in mapping order.
job_post_column_names = [
    "id",
    "date_posted",
    "field",
    "title",
    "job_url",
    "company_name",
    "description",
    "location",
    "company_industry",
]

View File

@ -3,6 +3,7 @@ import os
from jobspy import scrape_jobs
import pandas as pd
from jobspy.jobs import JobPost
from jobspy.scrapers.goozali.GoozaliMapper import GoozaliMapper
from jobspy.scrapers.goozali.GoozaliScrapperComponent import GoozaliScrapperComponent
from jobspy.scrapers.goozali.model import GoozaliColumn
@ -35,13 +36,18 @@ try:
filtered_rows_by_age_and_column_choice = component.filter_rows_by_hours(
filtered_rows_by_column_choice, hours_old)
# Key mapper: Extract 'id' as the key
def extract_goozali_column_id(column): return column.id if isinstance(
# Key mapper: Extract 'name' as the key
def extract_goozali_column_name(column):
    """Return ``column.name`` for a ``GoozaliColumn``, ``None`` otherwise."""
    if not isinstance(column, GoozaliColumn):
        return None
    return column.name
dict_column_id_to_column = create_dict_by_key_and_value(
response_data.columns, extract_goozali_column_id)
dict_column_name_to_column = create_dict_by_key_and_value(
response_data.columns, extract_goozali_column_name)
response: list[JobPost] = []
for row in filtered_rows_by_age_and_column_choice:
job_post = mapper.map_goozali_response_to_job_post(
row, dict_column_name_to_column)
response.append(job_post)
print("hello heloo")
print("kingggggg")
except FileNotFoundError:
print("The file was not found.")
except json.JSONDecodeError: