diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py index c4645d1..d7b9b06 100644 --- a/src/jobspy/__init__.py +++ b/src/jobspy/__init__.py @@ -1,6 +1,4 @@ from __future__ import annotations -from datetime import datetime -from enum import Enum import pandas as pd from typing import Tuple @@ -49,7 +47,7 @@ def scrape_jobs( hours_old: int = None, enforce_annual_salary: bool = False, verbose: int = 2, - **kwargs, + ** kwargs, ) -> pd.DataFrame: """ Simultaneously scrapes job data from multiple job sites. @@ -90,7 +88,6 @@ def scrape_jobs( return site_types country_enum = Country.from_string(country_indeed) - scraper_input = ScraperInput( site_type=get_site_type(), country=country_enum, @@ -107,7 +104,7 @@ def scrape_jobs( results_wanted=results_wanted, linkedin_company_ids=linkedin_company_ids, offset=offset, - hours_old=hours_old, + hours_old=hours_old ) def scrape_site(site: Site) -> Tuple[str, JobResponse]: diff --git a/src/jobspy/main.py b/src/jobspy/main.py index 1050a90..06939e6 100644 --- a/src/jobspy/main.py +++ b/src/jobspy/main.py @@ -35,7 +35,7 @@ async def main(): "Central, Israel", "Rehovot ,Israel"], results_wanted=200, hours_old=200, - country_indeed='israel', + country_indeed='israel' ) logger.info(f"Found {len(jobs)} jobs") jobs = list(filter(filter_jobs_by_title_name, jobs)) diff --git a/src/jobspy/scrapers/__init__.py b/src/jobspy/scrapers/__init__.py index 861d269..5ff6852 100644 --- a/src/jobspy/scrapers/__init__.py +++ b/src/jobspy/scrapers/__init__.py @@ -4,7 +4,6 @@ from abc import ABC, abstractmethod from jobspy.scrapers.site import Site - from ..jobs import ( Enum, BaseModel, diff --git a/src/jobspy/scrapers/goozali/GoozaliMapper.py b/src/jobspy/scrapers/goozali/GoozaliMapper.py index 5858b80..d63515a 100644 --- a/src/jobspy/scrapers/goozali/GoozaliMapper.py +++ b/src/jobspy/scrapers/goozali/GoozaliMapper.py @@ -2,10 +2,7 @@ from datetime import datetime import json from jobspy.jobs import JobPost, Location -from 
jobspy.scrapers.goozali.model import GoozaliColumnTypeOptions, GoozaliResponse, GoozaliRow -from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn -from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice -from jobspy.scrapers.goozali.model.GozaaliResponseData import GoozaliResponseData +from jobspy.scrapers.goozali.model import GoozaliColumnTypeOptions, GoozaliResponse, GoozaliRow, GoozaliColumn, GoozaliColumnChoice, GoozaliResponseData from .constants import job_post_column_to_goozali_column, job_post_column_names # Mapping function to convert parsed dictionary into GoozaliResponseData diff --git a/src/jobspy/scrapers/goozali/GoozaliScrapperComponent.py b/src/jobspy/scrapers/goozali/GoozaliScrapperComponent.py index 149560a..479f17a 100644 --- a/src/jobspy/scrapers/goozali/GoozaliScrapperComponent.py +++ b/src/jobspy/scrapers/goozali/GoozaliScrapperComponent.py @@ -1,8 +1,6 @@ from datetime import datetime, timedelta -from jobspy.scrapers.goozali.model import GoozaliRow -from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn -from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice +from jobspy.scrapers.goozali.model import GoozaliRow, GoozaliColumn, GoozaliColumnChoice from jobspy.scrapers.utils import create_logger # Mapping function to convert parsed dictionary into GoozaliResponseData diff --git a/src/jobspy/scrapers/goozali/__init__.py b/src/jobspy/scrapers/goozali/__init__.py index 7de4d7b..1cd2224 100644 --- a/src/jobspy/scrapers/goozali/__init__.py +++ b/src/jobspy/scrapers/goozali/__init__.py @@ -8,11 +8,11 @@ This module contains routines to scrape Goozali. from __future__ import annotations -from jobspy.scrapers import Scraper, ScraperInput +from .. 
import Scraper, ScraperInput from jobspy.scrapers.goozali.GoozaliMapper import GoozaliMapper from jobspy.scrapers.goozali.GoozaliScrapperComponent import GoozaliScrapperComponent -from jobspy.scrapers.goozali.constants import CHOICE_FIELD_KEY, extract_goozali_column_name, job_post_column_to_goozali_column -from jobspy.scrapers.goozali.model import GoozaliColumn, GoozaliPartRequest, GoozaliFullRequest +from jobspy.scrapers.goozali.constants import extract_goozali_column_name, job_post_column_to_goozali_column +from jobspy.scrapers.goozali.model import GoozaliColumn, GoozaliFieldChoice, GoozaliPartRequest, GoozaliFullRequest from jobspy.scrapers.site import Site from ..utils import create_dict_by_key_and_value, create_session, create_logger @@ -45,7 +45,6 @@ class GoozaliScraper(Scraper): ) self.mapper = GoozaliMapper() self.base_url = "https://airtable.com/v0.3/view/{view_id}/readSharedViewData" - self.view_ids = ["viwIOzPYaUGxlA0Jd"] self.component = GoozaliScrapperComponent() def scrape(self, scraper_input: ScraperInput) -> JobResponse: @@ -56,43 +55,41 @@ class GoozaliScraper(Scraper): """ self.scraper_input = scraper_input job_list: list[JobPost] = [] - seen_ids = set() - for view_id in self.view_ids: - full_request = GoozaliFullRequest(self.base_url) - part_request = GoozaliPartRequest(self.base_url) - try: - response = self.session.get( - url=full_request.url, - params=full_request.params, - timeout=10, - headers=full_request.headers, - cookies=full_request.cookies) - logger.info(f"response: {str(response)}") - if (response.status_code != 200): - logger.error(f"Status code: {response.status_code}, Error: { - str(response.text)}") - return JobResponse(jobs=job_list) - except Exception as e: - logger.error(f"Exception: {str(e)}") - # model the response with models - goozali_response = self.mapper.map_response_to_goozali_response( - response=response) - # suggestL create groupby field and then filter by hours - # filter result by Field - column = 
self.component.find_column( - goozali_response.data.columns, job_post_column_to_goozali_column["field"]) - column_choice = self.component.find_choice_from_column( - column, CHOICE_FIELD_KEY) - filtered_rows_by_column_choice = self.component.filter_rows_by_column_choice( - goozali_response.data.rows, column, column_choice) - filtered_rows_by_age_and_column_choice = self.component.filter_rows_by_hours( - filtered_rows_by_column_choice, scraper_input.hours_old) - dict_column_name_to_column: dict[str, GoozaliColumn] = create_dict_by_key_and_value( - goozali_response.data.columns, extract_goozali_column_name) - # map to JobResponse Object - for row in filtered_rows_by_age_and_column_choice: - job_post = self.mapper.map_goozali_response_to_job_post( - row, dict_column_name_to_column) - job_list.append(job_post) + full_request = GoozaliFullRequest(self.base_url) + part_request = GoozaliPartRequest(self.base_url) + try: + response = self.session.get( + url=full_request.url, + params=full_request.params, + timeout=10, + headers=full_request.headers, + cookies=full_request.cookies) + logger.info(f"response: {str(response)}") + if (response.status_code != 200): + logger.error(f"Status code: {response.status_code}, Error: { + str(response.text)}") + return JobResponse(jobs=job_list) + except Exception as e: + logger.error(f"Exception: {str(e)}") + # model the response with models + goozali_response = self.mapper.map_response_to_goozali_response( + response=response) + # suggestion: create groupby field and then filter by hours + # filter result by Field + column = self.component.find_column( + goozali_response.data.columns, job_post_column_to_goozali_column["field"]) + column_choice = self.component.find_choice_from_column( + column, GoozaliFieldChoice.SOFTWARE_ENGINEERING.value) + filtered_rows_by_column_choice = self.component.filter_rows_by_column_choice( + goozali_response.data.rows, column, column_choice) + filtered_rows_by_age_and_column_choice = 
self.component.filter_rows_by_hours( + filtered_rows_by_column_choice, scraper_input.hours_old) + dict_column_name_to_column: dict[str, GoozaliColumn] = create_dict_by_key_and_value( + goozali_response.data.columns, extract_goozali_column_name) + # map to JobResponse Object + for row in filtered_rows_by_age_and_column_choice: + job_post = self.mapper.map_goozali_response_to_job_post( + row, dict_column_name_to_column) + job_list.append(job_post) - return JobResponse(jobs=job_list) + return JobResponse(jobs=job_list) diff --git a/src/jobspy/scrapers/goozali/constants.py b/src/jobspy/scrapers/goozali/constants.py index b4a074c..13051e1 100644 --- a/src/jobspy/scrapers/goozali/constants.py +++ b/src/jobspy/scrapers/goozali/constants.py @@ -13,8 +13,6 @@ job_post_column_to_goozali_column = { "id": "Job ID" } -CHOICE_FIELD_KEY = "Software Engineering" - job_post_column_names = ["id", "date_posted", "field", diff --git a/src/jobspy/scrapers/goozali/model/GoozaliColumnTypeOptions.py b/src/jobspy/scrapers/goozali/model/GoozaliColumnTypeOptions.py index 1c01af5..60ecaab 100644 --- a/src/jobspy/scrapers/goozali/model/GoozaliColumnTypeOptions.py +++ b/src/jobspy/scrapers/goozali/model/GoozaliColumnTypeOptions.py @@ -1,4 +1,4 @@ -from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice +from jobspy.scrapers.goozali.model import GoozaliColumnChoice class GoozaliColumnTypeOptions: diff --git a/src/jobspy/scrapers/goozali/model/GoozaliFieldChoice.py b/src/jobspy/scrapers/goozali/model/GoozaliFieldChoice.py new file mode 100644 index 0000000..88f3bb2 --- /dev/null +++ b/src/jobspy/scrapers/goozali/model/GoozaliFieldChoice.py @@ -0,0 +1,31 @@ +from enum import Enum + + +class GoozaliFieldChoice(Enum): + PRODUCT_MANAGEMENT = "Product Management" + DATA_ANALYST = "Data Analyst" + DATA_SCIENCE_ML_ALGORITHMS = "Data Science, ML & Algorithms" + SOFTWARE_ENGINEERING = "Software Engineering" + QA = "QA" + CYBERSECURITY = "Cybersecurity" + 
IT_AND_SYSTEM_ADMINISTRATION = "IT and System Administration" + FRONTEND_DEVELOPMENT = "Frontend Development" + DEVOPS = "DevOps" + UI_UX_DESIGN_CONTENT = "UI/UX, Design & Content" + HR_RECRUITMENT = "HR & Recruitment" + MOBILE_DEVELOPMENT = "Mobile Development" + HARDWARE_ENGINEERING = "Hardware Engineering" + EMBEDDED_LOW_LEVEL_FIRMWARE_ENGINEERING = "Embedded, Low Level & Firmware Engineering" + CUSTOMER_SUCCESS = "Customer Success" + PROJECT_MANAGEMENT = "Project Management" + OPERATIONS = "Operations" + FINANCE = "Finance" + SYSTEMS_ENGINEERING = "Systems Engineering" + MARKETING = "Marketing" + SALES = "Sales" + COMPLIANCE_LEGAL_POLICY = "Compliance, Legal & Policy" + C_LEVEL = "C-Level" + BUSINESS_DEVELOPMENT = "Business Development" + MECHANICAL_ENGINEERING = "Mechanical Engineering" + NATURAL_SCIENCE = "Natural Science" + OTHER = "Other" diff --git a/src/jobspy/scrapers/goozali/model/FullRequest.py b/src/jobspy/scrapers/goozali/model/GoozaliFullRequest.py similarity index 100% rename from src/jobspy/scrapers/goozali/model/FullRequest.py rename to src/jobspy/scrapers/goozali/model/GoozaliFullRequest.py diff --git a/src/jobspy/scrapers/goozali/model/GoozaliResponse.py b/src/jobspy/scrapers/goozali/model/GoozaliResponse.py index 13cd65b..461fd9b 100644 --- a/src/jobspy/scrapers/goozali/model/GoozaliResponse.py +++ b/src/jobspy/scrapers/goozali/model/GoozaliResponse.py @@ -1,4 +1,4 @@ -from jobspy.scrapers.goozali.model.GozaaliResponseData import GoozaliResponseData +from jobspy.scrapers.goozali.model import GoozaliResponseData class GoozaliResponse: diff --git a/src/jobspy/scrapers/goozali/model/GozaaliResponseData.py b/src/jobspy/scrapers/goozali/model/GoozaliResponseData.py similarity index 81% rename from src/jobspy/scrapers/goozali/model/GozaaliResponseData.py rename to src/jobspy/scrapers/goozali/model/GoozaliResponseData.py index fb4035c..5755d83 100644 --- a/src/jobspy/scrapers/goozali/model/GozaaliResponseData.py +++ 
b/src/jobspy/scrapers/goozali/model/GoozaliResponseData.py @@ -1,5 +1,4 @@ -from jobspy.scrapers.goozali.model import GoozaliRow -from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn +from jobspy.scrapers.goozali.model import GoozaliRow, GoozaliColumn class GoozaliResponseData: diff --git a/src/jobspy/scrapers/goozali/model/__init__.py b/src/jobspy/scrapers/goozali/model/__init__.py index e57b3f0..c69310c 100644 --- a/src/jobspy/scrapers/goozali/model/__init__.py +++ b/src/jobspy/scrapers/goozali/model/__init__.py @@ -2,5 +2,8 @@ from .GoozaliRow import GoozaliRow from .GoozaliResponse import GoozaliResponse from .GoozaliColumn import GoozaliColumn from .GoozaliPartRequest import GoozaliPartRequest -from .FullRequest import GoozaliFullRequest +from .GoozaliFullRequest import GoozaliFullRequest from .GoozaliColumnTypeOptions import GoozaliColumnTypeOptions +from .GoozaliFieldChoice import GoozaliFieldChoice +from .GoozaliResponseData import GoozaliResponseData +from .GoozaliColumnChoice import GoozaliColumnChoice diff --git a/src/tests/test_goozali.py b/src/tests/test_goozali.py index de490b5..2f59956 100644 --- a/src/tests/test_goozali.py +++ b/src/tests/test_goozali.py @@ -1,14 +1,11 @@ import json import os -from jobspy import scrape_jobs -import pandas as pd from jobspy.jobs import JobPost from jobspy.scrapers.goozali.GoozaliMapper import GoozaliMapper from jobspy.scrapers.goozali.GoozaliScrapperComponent import GoozaliScrapperComponent -from jobspy.scrapers.goozali.constants import CHOICE_FIELD_KEY, extract_goozali_column_name, job_post_column_to_goozali_column -from jobspy.scrapers.goozali.model import GoozaliColumn -from jobspy.scrapers.goozali.model.GozaaliResponseData import GoozaliResponseData +from jobspy.scrapers.goozali.constants import extract_goozali_column_name, job_post_column_to_goozali_column +from jobspy.scrapers.goozali.model import GoozaliColumn, GoozaliFieldChoice, GoozaliResponseData from jobspy.scrapers.utils import 
create_dict_by_key_and_value # URL Example # https://airtable.com/v0.3/view/viwagEIbkfz2iMsLU/readSharedViewData?stringifiedObjectParams=%7B%22shouldUseNestedResponseFormat%22%3Atrue%7D&requestId=reqXyRSHWlXyiRgY9&accessPolicy=%7B%22allowedActions%22%3A%5B%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSharedViewData%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22getMetadataForPrinting%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSignedAttachmentUrls%22%7D%2C%7B%22modelClassName%22%3A%22row%22%2C%22modelIdSelector%22%3A%22rows%20*%5BdisplayedInView%3DviwagEIbkfz2iMsLU%5D%22%2C%22action%22%3A%22createDocumentPreviewSession%22%7D%5D%2C%22shareId%22%3A%22shr97tl6luEk4Ca9R%22%2C%22applicationId%22%3A%22app5sYJyDgcRbJWYU%22%2C%22generationNumber%22%3A0%2C%22expires%22%3A%222025-01-02T00%3A00%3A00.000Z%22%2C%22signature%22%3A%223aa292ee44d15aa75d9506200329e413653471f89e000fa370ef9fa38393070a%22%7D @@ -31,7 +28,7 @@ try: column = component.find_column( response_data.columns, job_post_column_to_goozali_column["field"]) column_choice = component.find_choice_from_column( - column, CHOICE_FIELD_KEY) + column, GoozaliFieldChoice.SOFTWARE_ENGINEERING) filtered_rows_by_column_choice = component.filter_rows_by_column_choice( response_data.rows, column, column_choice)