From f2d5bb6cfa4c73655ddd2503c583f6d32f02ee6d Mon Sep 17 00:00:00 2001 From: Yariv Menachem Date: Wed, 18 Dec 2024 16:28:55 +0200 Subject: [PATCH] maped to job post object fix the location by goozali ids --- src/jobspy/jobs/__init__.py | 8 +++ src/jobspy/scrapers/goozali/GoozaliMapper.py | 26 ++++++- src/jobspy/scrapers/goozali/__init__.py | 1 - src/jobspy/scrapers/goozali/constants.py | 76 +++++--------------- src/tests/test_goozali.py | 16 +++-- 5 files changed, 61 insertions(+), 66 deletions(-) diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py index beef998..7ad75a3 100644 --- a/src/jobspy/jobs/__init__.py +++ b/src/jobspy/jobs/__init__.py @@ -185,6 +185,7 @@ class Location(BaseModel): country: Country | str | None = None city: Optional[str] = None state: Optional[str] = None + text: str = None def display_location(self) -> str: location_parts = [] @@ -253,6 +254,12 @@ class DescriptionFormat(Enum): class JobPost(BaseModel): + + # def __init__(self, obj): + # super().__init__() + # for key, value in obj.items(): + # setattr(self, key, value) + id: str | None = None title: str company_name: str | None @@ -271,6 +278,7 @@ class JobPost(BaseModel): emails: list[str] | None = None is_remote: bool | None = None listing_type: str | None = None + field: str | None = None # linkedin specific job_level: str | None = None diff --git a/src/jobspy/scrapers/goozali/GoozaliMapper.py b/src/jobspy/scrapers/goozali/GoozaliMapper.py index fbcf57b..a986a4e 100644 --- a/src/jobspy/scrapers/goozali/GoozaliMapper.py +++ b/src/jobspy/scrapers/goozali/GoozaliMapper.py @@ -1,10 +1,12 @@ +from datetime import datetime import json -from jobspy.jobs import JobPost +from jobspy.jobs import JobPost, Location from jobspy.scrapers.goozali.model import GoozaliColumnTypeOptions, GoozaliResponse, GoozaliRow from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice from jobspy.scrapers.goozali.model.GozaaliResponseData import GoozaliResponseData +from .constants import job_post_column_to_goozali_column, job_post_column_names # Mapping function to convert parsed dictionary into GoozaliResponseData @@ -77,6 +79,24 @@ class GoozaliMapper: # Return a new GoozaliResponse with msg and the converted data return GoozaliResponse(msg=data['msg'], data=data_obj) - def map_goozali_response_to_job_post(self, row: GoozaliRow, columns: dict[str, GoozaliColumn]) -> JobPost: + def get_value_by_job_post_Id(self, job_post_column: str, row: GoozaliRow, dict_column_name_to_columnZ): + goozali_column_name = job_post_column_to_goozali_column[job_post_column] + column = dict_column_name_to_columnZ[goozali_column_name] + value = row.cellValuesByColumnId[column.id] + if (job_post_column == "location"): + # todo: fix it + return Location(text="tel aviv") - return JobPost() + if (job_post_column == "date_posted"): + return datetime.fromisoformat(value.replace("Z", "")).date() + + return str(value) + + def map_goozali_response_to_job_post(self, row: GoozaliRow, dict_column_name_to_column) -> JobPost: + temp = {} + for col in job_post_column_names: + value = self.get_value_by_job_post_Id( + col, row, dict_column_name_to_column) + temp[col] = value + + return JobPost.model_validate(temp) diff --git a/src/jobspy/scrapers/goozali/__init__.py b/src/jobspy/scrapers/goozali/__init__.py index cb25d08..b7693fc 100644 --- a/src/jobspy/scrapers/goozali/__init__.py +++ b/src/jobspy/scrapers/goozali/__init__.py @@ -18,7 +18,6 @@ from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoic from jobspy.scrapers.site import Site from ..utils import create_session, create_logger -from .constants import headers from ...jobs import ( JobPost, JobResponse, diff --git a/src/jobspy/scrapers/goozali/constants.py b/src/jobspy/scrapers/goozali/constants.py index 88adcf3..23b8297 100644 --- a/src/jobspy/scrapers/goozali/constants.py +++ b/src/jobspy/scrapers/goozali/constants.py @@ -1,59 +1,21 @@ -import json - - -view_ids = ["viwIOzPYaUGxlA0Jd"] - -headers = { - 'accept': '*/*', - 'accept-language': 'en-US,en;q=0.9,he-IL;q=0.8,he;q=0.7', - 'priority': 'u=1, i', - 'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"', - 'sec-ch-ua-mobile': '?0', - 'sec-ch-ua-platform': '"Windows"', - 'sec-fetch-dest': 'empty', - 'sec-fetch-mode': 'cors', - 'sec-fetch-site': 'same-origin', - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36', - 'x-airtable-accept-msgpack': 'true', - 'x-airtable-application-id': 'appwewqLk7iUY4azc', - 'x-airtable-inter-service-client': 'webClient', - 'x-airtable-page-load-id': 'pglqAAzFDZEWCEC7s', - 'x-early-prefetch': 'true', - 'x-requested-with': 'XMLHttpRequest', - 'x-time-zone': 'Asia/Jerusalem', - 'x-user-locale': 'en' +job_post_column_to_goozali_column = { + "date_posted": "Discovered", + "field": "Field", + "title": "Job Title", + "job_url": "Position Link", + "company_name": "Company", + "description": "Requirements", + "location": "Location", + "company_industry": "Company Industry", + "id": "Job ID" } -session_id = "lWt/xRLIQas/blkys/2YBYl0priNI7gv85sXXtmkrW+TzbLHR8Vm6iY5RDialmLUYsQgLab8uWZyahWRw0HizxdOXhJxd5FB66H85GpUAX8zZbAZPZdUHvzxjaVa130w14QSXDa8OmsNlpKtiUtZ/DXMTOZ1wYDWC4tVJTKJ171wyKA7C9E=" - -cookies = {} - -request_id = "req4q4tKw3woEEWxw&" -share_id = "shrQBuWjXd0YgPqV6" -application_id = "appwewqLk7iUY4azc" -signature = "be8bd40c133f051f929ebab311c416013f5af0d5acae4264575b88ccf051ee59" - - -def get_access_policy(view_id: str) -> dict[str, str]: - access_policy = { - "allowedActions": [ - {"modelClassName": "view", "modelIdSelector": view_id, - "action": "readSharedViewData"}, - {"modelClassName": "view", "modelIdSelector": view_id, - "action": "getMetadataForPrinting"}, - {"modelClassName": "view", "modelIdSelector": view_id, - "action": "readSignedAttachmentUrls"}, - {"modelClassName": "row", "modelIdSelector": f"rows *[displayedInView={view_id}]", - "action": "createDocumentPreviewSession"} - ], - "shareId": share_id, - "applicationId": application_id, - "generationNumber": 0, - "expires": "2025-01-02T00:00:00.000Z", - "signature": signature - } - # Convert to a JSON string - return json.dumps(access_policy) - - -stringifiedObjectParams = {"shouldUseNestedResponseFormat": "true"} +job_post_column_names = ["id", + "date_posted", + "field", + "title", + "job_url", + "company_name", + "description", + "location", + "company_industry"] diff --git a/src/tests/test_goozali.py b/src/tests/test_goozali.py index a0e0af3..8ed2f11 100644 --- a/src/tests/test_goozali.py +++ b/src/tests/test_goozali.py @@ -3,6 +3,7 @@ import os from jobspy import scrape_jobs import pandas as pd +from jobspy.jobs import JobPost from jobspy.scrapers.goozali.GoozaliMapper import GoozaliMapper from jobspy.scrapers.goozali.GoozaliScrapperComponent import GoozaliScrapperComponent from jobspy.scrapers.goozali.model import GoozaliColumn @@ -35,13 +36,18 @@ try: filtered_rows_by_age_and_column_choice = component.filter_rows_by_hours( filtered_rows_by_column_choice, hours_old) - # Key mapper: Extract 'id' as the key - def extract_goozali_column_id(column): return column.id if isinstance( + # Key mapper: Extract 'name' as the key + def extract_goozali_column_name(column): return column.name if isinstance( column, GoozaliColumn) else None - dict_column_id_to_column = create_dict_by_key_and_value( - response_data.columns, extract_goozali_column_id) + dict_column_name_to_column = create_dict_by_key_and_value( + response_data.columns, extract_goozali_column_name) + response: list[JobPost] = [] + for row in filtered_rows_by_age_and_column_choice: + job_post = mapper.map_goozali_response_to_job_post( + row, dict_column_name_to_column) + response.append(job_post) - print("hello heloo") + print("kingggggg") except FileNotFoundError: print("The file was not found.") except json.JSONDecodeError: