mirror of https://github.com/Bunsly/JobSpy
Import problem when getting the field choice from main, but class names are aligned
parent
cbe3a97a73
commit
b55287b5ec
|
@ -1,6 +1,4 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from datetime import datetime
|
|
||||||
from enum import Enum
|
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
|
@ -49,7 +47,7 @@ def scrape_jobs(
|
||||||
hours_old: int = None,
|
hours_old: int = None,
|
||||||
enforce_annual_salary: bool = False,
|
enforce_annual_salary: bool = False,
|
||||||
verbose: int = 2,
|
verbose: int = 2,
|
||||||
**kwargs,
|
** kwargs,
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
Simultaneously scrapes job data from multiple job sites.
|
Simultaneously scrapes job data from multiple job sites.
|
||||||
|
@ -90,7 +88,6 @@ def scrape_jobs(
|
||||||
return site_types
|
return site_types
|
||||||
|
|
||||||
country_enum = Country.from_string(country_indeed)
|
country_enum = Country.from_string(country_indeed)
|
||||||
|
|
||||||
scraper_input = ScraperInput(
|
scraper_input = ScraperInput(
|
||||||
site_type=get_site_type(),
|
site_type=get_site_type(),
|
||||||
country=country_enum,
|
country=country_enum,
|
||||||
|
@ -107,7 +104,7 @@ def scrape_jobs(
|
||||||
results_wanted=results_wanted,
|
results_wanted=results_wanted,
|
||||||
linkedin_company_ids=linkedin_company_ids,
|
linkedin_company_ids=linkedin_company_ids,
|
||||||
offset=offset,
|
offset=offset,
|
||||||
hours_old=hours_old,
|
hours_old=hours_old
|
||||||
)
|
)
|
||||||
|
|
||||||
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
|
def scrape_site(site: Site) -> Tuple[str, JobResponse]:
|
||||||
|
|
|
@ -35,7 +35,7 @@ async def main():
|
||||||
"Central, Israel", "Rehovot ,Israel"],
|
"Central, Israel", "Rehovot ,Israel"],
|
||||||
results_wanted=200,
|
results_wanted=200,
|
||||||
hours_old=200,
|
hours_old=200,
|
||||||
country_indeed='israel',
|
country_indeed='israel'
|
||||||
)
|
)
|
||||||
logger.info(f"Found {len(jobs)} jobs")
|
logger.info(f"Found {len(jobs)} jobs")
|
||||||
jobs = list(filter(filter_jobs_by_title_name, jobs))
|
jobs = list(filter(filter_jobs_by_title_name, jobs))
|
||||||
|
|
|
@ -4,7 +4,6 @@ from abc import ABC, abstractmethod
|
||||||
|
|
||||||
from jobspy.scrapers.site import Site
|
from jobspy.scrapers.site import Site
|
||||||
|
|
||||||
|
|
||||||
from ..jobs import (
|
from ..jobs import (
|
||||||
Enum,
|
Enum,
|
||||||
BaseModel,
|
BaseModel,
|
||||||
|
|
|
@ -2,10 +2,7 @@ from datetime import datetime
|
||||||
import json
|
import json
|
||||||
|
|
||||||
from jobspy.jobs import JobPost, Location
|
from jobspy.jobs import JobPost, Location
|
||||||
from jobspy.scrapers.goozali.model import GoozaliColumnTypeOptions, GoozaliResponse, GoozaliRow
|
from jobspy.scrapers.goozali.model import GoozaliColumnTypeOptions, GoozaliResponse, GoozaliRow, GoozaliColumn, GoozaliColumnChoice, GoozaliResponseData
|
||||||
from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
|
|
||||||
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
|
|
||||||
from jobspy.scrapers.goozali.model.GozaaliResponseData import GoozaliResponseData
|
|
||||||
from .constants import job_post_column_to_goozali_column, job_post_column_names
|
from .constants import job_post_column_to_goozali_column, job_post_column_names
|
||||||
|
|
||||||
# Mapping function to convert parsed dictionary into GoozaliResponseData
|
# Mapping function to convert parsed dictionary into GoozaliResponseData
|
||||||
|
|
|
@ -1,8 +1,6 @@
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
from jobspy.scrapers.goozali.model import GoozaliRow
|
from jobspy.scrapers.goozali.model import GoozaliRow, GoozaliColumn, GoozaliColumnChoice
|
||||||
from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
|
|
||||||
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
|
|
||||||
from jobspy.scrapers.utils import create_logger
|
from jobspy.scrapers.utils import create_logger
|
||||||
|
|
||||||
# Mapping function to convert parsed dictionary into GoozaliResponseData
|
# Mapping function to convert parsed dictionary into GoozaliResponseData
|
||||||
|
|
|
@ -8,11 +8,11 @@ This module contains routines to scrape Goozali.
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
|
||||||
from jobspy.scrapers import Scraper, ScraperInput
|
from .. import Scraper, ScraperInput
|
||||||
from jobspy.scrapers.goozali.GoozaliMapper import GoozaliMapper
|
from jobspy.scrapers.goozali.GoozaliMapper import GoozaliMapper
|
||||||
from jobspy.scrapers.goozali.GoozaliScrapperComponent import GoozaliScrapperComponent
|
from jobspy.scrapers.goozali.GoozaliScrapperComponent import GoozaliScrapperComponent
|
||||||
from jobspy.scrapers.goozali.constants import CHOICE_FIELD_KEY, extract_goozali_column_name, job_post_column_to_goozali_column
|
from jobspy.scrapers.goozali.constants import extract_goozali_column_name, job_post_column_to_goozali_column
|
||||||
from jobspy.scrapers.goozali.model import GoozaliColumn, GoozaliPartRequest, GoozaliFullRequest
|
from jobspy.scrapers.goozali.model import GoozaliColumn, GoozaliFieldChoice, GoozaliPartRequest, GoozaliFullRequest
|
||||||
from jobspy.scrapers.site import Site
|
from jobspy.scrapers.site import Site
|
||||||
|
|
||||||
from ..utils import create_dict_by_key_and_value, create_session, create_logger
|
from ..utils import create_dict_by_key_and_value, create_session, create_logger
|
||||||
|
@ -45,7 +45,6 @@ class GoozaliScraper(Scraper):
|
||||||
)
|
)
|
||||||
self.mapper = GoozaliMapper()
|
self.mapper = GoozaliMapper()
|
||||||
self.base_url = "https://airtable.com/v0.3/view/{view_id}/readSharedViewData"
|
self.base_url = "https://airtable.com/v0.3/view/{view_id}/readSharedViewData"
|
||||||
self.view_ids = ["viwIOzPYaUGxlA0Jd"]
|
|
||||||
self.component = GoozaliScrapperComponent()
|
self.component = GoozaliScrapperComponent()
|
||||||
|
|
||||||
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
|
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
|
||||||
|
@ -56,43 +55,41 @@ class GoozaliScraper(Scraper):
|
||||||
"""
|
"""
|
||||||
self.scraper_input = scraper_input
|
self.scraper_input = scraper_input
|
||||||
job_list: list[JobPost] = []
|
job_list: list[JobPost] = []
|
||||||
seen_ids = set()
|
full_request = GoozaliFullRequest(self.base_url)
|
||||||
for view_id in self.view_ids:
|
part_request = GoozaliPartRequest(self.base_url)
|
||||||
full_request = GoozaliFullRequest(self.base_url)
|
try:
|
||||||
part_request = GoozaliPartRequest(self.base_url)
|
response = self.session.get(
|
||||||
try:
|
url=full_request.url,
|
||||||
response = self.session.get(
|
params=full_request.params,
|
||||||
url=full_request.url,
|
timeout=10,
|
||||||
params=full_request.params,
|
headers=full_request.headers,
|
||||||
timeout=10,
|
cookies=full_request.cookies)
|
||||||
headers=full_request.headers,
|
logger.info(f"response: {str(response)}")
|
||||||
cookies=full_request.cookies)
|
if (response.status_code != 200):
|
||||||
logger.info(f"response: {str(response)}")
|
logger.error(f"Status code: {response.status_code}, Error: {
|
||||||
if (response.status_code != 200):
|
str(response.text)}")
|
||||||
logger.error(f"Status code: {response.status_code}, Error: {
|
return JobResponse(jobs=job_list)
|
||||||
str(response.text)}")
|
except Exception as e:
|
||||||
return JobResponse(jobs=job_list)
|
logger.error(f"Exception: {str(e)}")
|
||||||
except Exception as e:
|
# model the response with models
|
||||||
logger.error(f"Exception: {str(e)}")
|
goozali_response = self.mapper.map_response_to_goozali_response(
|
||||||
# model the response with models
|
response=response)
|
||||||
goozali_response = self.mapper.map_response_to_goozali_response(
|
# suggest: create groupby field and then filter by hours
|
||||||
response=response)
|
# filter result by Field
|
||||||
# suggest: create groupby field and then filter by hours
|
column = self.component.find_column(
|
||||||
# filter result by Field
|
goozali_response.data.columns, job_post_column_to_goozali_column["field"])
|
||||||
column = self.component.find_column(
|
column_choice = self.component.find_choice_from_column(
|
||||||
goozali_response.data.columns, job_post_column_to_goozali_column["field"])
|
column, GoozaliFieldChoice.SOFTWARE_ENGINEERING.value)
|
||||||
column_choice = self.component.find_choice_from_column(
|
filtered_rows_by_column_choice = self.component.filter_rows_by_column_choice(
|
||||||
column, CHOICE_FIELD_KEY)
|
goozali_response.data.rows, column, column_choice)
|
||||||
filtered_rows_by_column_choice = self.component.filter_rows_by_column_choice(
|
filtered_rows_by_age_and_column_choice = self.component.filter_rows_by_hours(
|
||||||
goozali_response.data.rows, column, column_choice)
|
filtered_rows_by_column_choice, scraper_input.hours_old)
|
||||||
filtered_rows_by_age_and_column_choice = self.component.filter_rows_by_hours(
|
dict_column_name_to_column: dict[str, GoozaliColumn] = create_dict_by_key_and_value(
|
||||||
filtered_rows_by_column_choice, scraper_input.hours_old)
|
goozali_response.data.columns, extract_goozali_column_name)
|
||||||
dict_column_name_to_column: dict[str, GoozaliColumn] = create_dict_by_key_and_value(
|
# map to JobResponse Object
|
||||||
goozali_response.data.columns, extract_goozali_column_name)
|
for row in filtered_rows_by_age_and_column_choice:
|
||||||
# map to JobResponse Object
|
job_post = self.mapper.map_goozali_response_to_job_post(
|
||||||
for row in filtered_rows_by_age_and_column_choice:
|
row, dict_column_name_to_column)
|
||||||
job_post = self.mapper.map_goozali_response_to_job_post(
|
job_list.append(job_post)
|
||||||
row, dict_column_name_to_column)
|
|
||||||
job_list.append(job_post)
|
|
||||||
|
|
||||||
return JobResponse(jobs=job_list)
|
return JobResponse(jobs=job_list)
|
||||||
|
|
|
@ -13,8 +13,6 @@ job_post_column_to_goozali_column = {
|
||||||
"id": "Job ID"
|
"id": "Job ID"
|
||||||
}
|
}
|
||||||
|
|
||||||
CHOICE_FIELD_KEY = "Software Engineering"
|
|
||||||
|
|
||||||
job_post_column_names = ["id",
|
job_post_column_names = ["id",
|
||||||
"date_posted",
|
"date_posted",
|
||||||
"field",
|
"field",
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from jobspy.scrapers.goozali.model.GoozaliColumnChoice import GoozaliColumnChoice
|
from jobspy.scrapers.goozali.model import GoozaliColumnChoice
|
||||||
|
|
||||||
|
|
||||||
class GoozaliColumnTypeOptions:
|
class GoozaliColumnTypeOptions:
|
||||||
|
|
|
@ -0,0 +1,31 @@
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
|
||||||
|
class GoozaliFieldChoice(Enum):
|
||||||
|
PRODUCT_MANAGEMENT = "Product Management"
|
||||||
|
DATA_ANALYST = "Data Analyst"
|
||||||
|
DATA_SCIENCE_ML_ALGORITHMS = "Data Science, ML & Algorithms"
|
||||||
|
SOFTWARE_ENGINEERING = "Software Engineering"
|
||||||
|
QA = "QA"
|
||||||
|
CYBERSECURITY = "Cybersecurity"
|
||||||
|
IT_AND_SYSTEM_ADMINISTRATION = "IT and System Administration"
|
||||||
|
FRONTEND_DEVELOPMENT = "Frontend Development"
|
||||||
|
DEVOPS = "DevOps"
|
||||||
|
UI_UX_DESIGN_CONTENT = "UI/UX, Design & Content"
|
||||||
|
HR_RECRUITMENT = "HR & Recruitment"
|
||||||
|
MOBILE_DEVELOPMENT = "Mobile Development"
|
||||||
|
HARDWARE_ENGINEERING = "Hardware Engineering"
|
||||||
|
EMBEDDED_LOW_LEVEL_FIRMWARE_ENGINEERING = "Embedded, Low Level & Firmware Engineering"
|
||||||
|
CUSTOMER_SUCCESS = "Customer Success"
|
||||||
|
PROJECT_MANAGEMENT = "Project Management"
|
||||||
|
OPERATIONS = "Operations"
|
||||||
|
FINANCE = "Finance"
|
||||||
|
SYSTEMS_ENGINEERING = "Systems Engineering"
|
||||||
|
MARKETING = "Marketing"
|
||||||
|
SALES = "Sales"
|
||||||
|
COMPLIANCE_LEGAL_POLICY = "Compliance, Legal & Policy"
|
||||||
|
C_LEVEL = "C-Level"
|
||||||
|
BUSINESS_DEVELOPMENT = "Business Development"
|
||||||
|
MECHANICAL_ENGINEERING = "Mechanical Engineering"
|
||||||
|
NATURAL_SCIENCE = "Natural Science"
|
||||||
|
OTHER = "Other"
|
|
@ -1,4 +1,4 @@
|
||||||
from jobspy.scrapers.goozali.model.GozaaliResponseData import GoozaliResponseData
|
from jobspy.scrapers.goozali.model import GoozaliResponseData
|
||||||
|
|
||||||
|
|
||||||
class GoozaliResponse:
|
class GoozaliResponse:
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
from jobspy.scrapers.goozali.model import GoozaliRow
|
from jobspy.scrapers.goozali.model import GoozaliRow, GoozaliColumn
|
||||||
from jobspy.scrapers.goozali.model.GoozaliColumn import GoozaliColumn
|
|
||||||
|
|
||||||
|
|
||||||
class GoozaliResponseData:
|
class GoozaliResponseData:
|
|
@ -2,5 +2,8 @@ from .GoozaliRow import GoozaliRow
|
||||||
from .GoozaliResponse import GoozaliResponse
|
from .GoozaliResponse import GoozaliResponse
|
||||||
from .GoozaliColumn import GoozaliColumn
|
from .GoozaliColumn import GoozaliColumn
|
||||||
from .GoozaliPartRequest import GoozaliPartRequest
|
from .GoozaliPartRequest import GoozaliPartRequest
|
||||||
from .FullRequest import GoozaliFullRequest
|
from .GoozaliFullRequest import GoozaliFullRequest
|
||||||
from .GoozaliColumnTypeOptions import GoozaliColumnTypeOptions
|
from .GoozaliColumnTypeOptions import GoozaliColumnTypeOptions
|
||||||
|
from .GoozaliFieldChoice import GoozaliFieldChoice
|
||||||
|
from .GoozaliResponseData import GoozaliResponseData
|
||||||
|
from .GoozaliColumnChoice import GoozaliColumnChoice
|
||||||
|
|
|
@ -1,14 +1,11 @@
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
from jobspy import scrape_jobs
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
from jobspy.jobs import JobPost
|
from jobspy.jobs import JobPost
|
||||||
from jobspy.scrapers.goozali.GoozaliMapper import GoozaliMapper
|
from jobspy.scrapers.goozali.GoozaliMapper import GoozaliMapper
|
||||||
from jobspy.scrapers.goozali.GoozaliScrapperComponent import GoozaliScrapperComponent
|
from jobspy.scrapers.goozali.GoozaliScrapperComponent import GoozaliScrapperComponent
|
||||||
from jobspy.scrapers.goozali.constants import CHOICE_FIELD_KEY, extract_goozali_column_name, job_post_column_to_goozali_column
|
from jobspy.scrapers.goozali.constants import extract_goozali_column_name, job_post_column_to_goozali_column
|
||||||
from jobspy.scrapers.goozali.model import GoozaliColumn
|
from jobspy.scrapers.goozali.model import GoozaliColumn, GoozaliFieldChoice, GoozaliResponseData
|
||||||
from jobspy.scrapers.goozali.model.GozaaliResponseData import GoozaliResponseData
|
|
||||||
from jobspy.scrapers.utils import create_dict_by_key_and_value
|
from jobspy.scrapers.utils import create_dict_by_key_and_value
|
||||||
# URL Example
|
# URL Example
|
||||||
# https://airtable.com/v0.3/view/viwagEIbkfz2iMsLU/readSharedViewData?stringifiedObjectParams=%7B%22shouldUseNestedResponseFormat%22%3Atrue%7D&requestId=reqXyRSHWlXyiRgY9&accessPolicy=%7B%22allowedActions%22%3A%5B%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSharedViewData%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22getMetadataForPrinting%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSignedAttachmentUrls%22%7D%2C%7B%22modelClassName%22%3A%22row%22%2C%22modelIdSelector%22%3A%22rows%20*%5BdisplayedInView%3DviwagEIbkfz2iMsLU%5D%22%2C%22action%22%3A%22createDocumentPreviewSession%22%7D%5D%2C%22shareId%22%3A%22shr97tl6luEk4Ca9R%22%2C%22applicationId%22%3A%22app5sYJyDgcRbJWYU%22%2C%22generationNumber%22%3A0%2C%22expires%22%3A%222025-01-02T00%3A00%3A00.000Z%22%2C%22signature%22%3A%223aa292ee44d15aa75d9506200329e413653471f89e000fa370ef9fa38393070a%22%7D
|
# https://airtable.com/v0.3/view/viwagEIbkfz2iMsLU/readSharedViewData?stringifiedObjectParams=%7B%22shouldUseNestedResponseFormat%22%3Atrue%7D&requestId=reqXyRSHWlXyiRgY9&accessPolicy=%7B%22allowedActions%22%3A%5B%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSharedViewData%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22getMetadataForPrinting%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viwagEIbkfz2iMsLU%22%2C%22action%22%3A%22readSignedAttachmentUrls%22%7D%2C%7B%22modelClassName%22%3A%22row%22%2C%22modelIdSelector%22%3A%22rows%20*%5BdisplayedInView%3DviwagEIbkfz2iMsLU%5D%22%2C%22action%22%3A%22createDocumentPreviewSession%22%7D%5D%2C%22shareId%22%3A%22shr97tl6luEk4Ca9R%22%2C%22applicationId%22%3A%22app5sYJyDgcRbJWYU%22%2C%22generationNumber%22%3A0%2C%22expires%22%3A%222025-01-02T00%3A00%3A00.000Z%22%2C%22signature%22%3A%223aa292ee44d15aa75d9506200329e413653471f89e000fa370ef9fa38393070a%22%7D
|
||||||
|
@ -31,7 +28,7 @@ try:
|
||||||
column = component.find_column(
|
column = component.find_column(
|
||||||
response_data.columns, job_post_column_to_goozali_column["field"])
|
response_data.columns, job_post_column_to_goozali_column["field"])
|
||||||
column_choice = component.find_choice_from_column(
|
column_choice = component.find_choice_from_column(
|
||||||
column, CHOICE_FIELD_KEY)
|
column, GoozaliFieldChoice.SOFTWARE_ENGINEERING)
|
||||||
|
|
||||||
filtered_rows_by_column_choice = component.filter_rows_by_column_choice(
|
filtered_rows_by_column_choice = component.filter_rows_by_column_choice(
|
||||||
response_data.rows, column, column_choice)
|
response_data.rows, column, column_choice)
|
||||||
|
|
Loading…
Reference in New Issue