format: jobspy

pull/127/head
VitaminB16 2024-03-09 19:05:36 +00:00
parent e9b9c22b78
commit e9804ab4eb
1 changed file with 13 additions and 13 deletions

@@ -72,6 +72,7 @@ def scrape_jobs(
                 for site in site_name
             ]
         return site_types
     country_enum = Country.from_string(country_indeed)
     scraper_input = ScraperInput(
@@ -88,14 +89,15 @@ def scrape_jobs(
         results_wanted=results_wanted,
         linkedin_company_ids=linkedin_company_ids,
         offset=offset,
-        hours_old=hours_old
+        hours_old=hours_old,
     )
     def scrape_site(site: Site) -> Tuple[str, JobResponse]:
         scraper_class = SCRAPER_MAPPING[site]
         scraper = scraper_class(proxy=proxy)
         scraped_data: JobResponse = scraper.scrape(scraper_input)
-        site_name = 'ZipRecruiter' if site.value.capitalize() == 'Zip_recruiter' else site.value.capitalize()
+        cap_name = site.value.capitalize()
+        site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
         logger.info(f"{site_name} finished scraping")
         return site.value, scraped_data
@@ -119,9 +121,8 @@ def scrape_jobs(
     for site, job_response in site_to_jobs_dict.items():
         for job in job_response.jobs:
             job_data = job.dict()
-            job_data[
-                "job_url_hyper"
-            ] = f'<a href="{job_data["job_url"]}">{job_data["job_url"]}</a>'
+            job_url = job_data["job_url"]
+            job_data["job_url_hyper"] = f'<a href="{job_url}">{job_url}</a>'
             job_data["site"] = site
             job_data["company"] = job_data["company_name"]
             job_data["job_type"] = (
@@ -158,11 +159,11 @@ def scrape_jobs(
     if jobs_dfs:
         # Step 1: Filter out all-NA columns from each DataFrame before concatenation
-        filtered_dfs = [df.dropna(axis=1, how='all') for df in jobs_dfs]
+        filtered_dfs = [df.dropna(axis=1, how="all") for df in jobs_dfs]
         # Step 2: Concatenate the filtered DataFrames
         jobs_df = pd.concat(filtered_dfs, ignore_index=True)
         # Desired column order
         desired_order = [
             "site",
@@ -180,7 +181,6 @@ def scrape_jobs(
             "is_remote",
             "emails",
             "description",
             "company_url",
             "company_url_direct",
             "company_addresses",
@@ -193,16 +193,16 @@ def scrape_jobs(
             "ceo_name",
             "ceo_photo_url",
         ]
         # Step 3: Ensure all desired columns are present, adding missing ones as empty
         for column in desired_order:
             if column not in jobs_df.columns:
                 jobs_df[column] = None  # Add missing columns as empty
         # Reorder the DataFrame according to the desired order
         jobs_df = jobs_df[desired_order]
         # Step 4: Sort the DataFrame as required
-        return jobs_df.sort_values(by=['site', 'date_posted'], ascending=[True, False])
+        return jobs_df.sort_values(by=["site", "date_posted"], ascending=[True, False])
     else:
         return pd.DataFrame()