mirror of https://github.com/Bunsly/JobSpy
format: jobspy
parent: e9b9c22b78
commit: e9804ab4eb
@@ -72,6 +72,7 @@ def scrape_jobs(
             for site in site_name
         ]
         return site_types
 
     country_enum = Country.from_string(country_indeed)
+
     scraper_input = ScraperInput(
@@ -88,14 +89,15 @@ def scrape_jobs(
         results_wanted=results_wanted,
         linkedin_company_ids=linkedin_company_ids,
         offset=offset,
-        hours_old=hours_old
+        hours_old=hours_old,
     )
 
     def scrape_site(site: Site) -> Tuple[str, JobResponse]:
         scraper_class = SCRAPER_MAPPING[site]
         scraper = scraper_class(proxy=proxy)
         scraped_data: JobResponse = scraper.scrape(scraper_input)
-        site_name = 'ZipRecruiter' if site.value.capitalize() == 'Zip_recruiter' else site.value.capitalize()
+        cap_name = site.value.capitalize()
+        site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
         logger.info(f"{site_name} finished scraping")
         return site.value, scraped_data
 
@@ -119,9 +121,8 @@ def scrape_jobs(
     for site, job_response in site_to_jobs_dict.items():
         for job in job_response.jobs:
             job_data = job.dict()
-            job_data[
-                "job_url_hyper"
-            ] = f'<a href="{job_data["job_url"]}">{job_data["job_url"]}</a>'
+            job_url = job_data["job_url"]
+            job_data["job_url_hyper"] = f'<a href="{job_url}">{job_url}</a>'
             job_data["site"] = site
             job_data["company"] = job_data["company_name"]
             job_data["job_type"] = (
@@ -158,7 +159,7 @@ def scrape_jobs(
 
     if jobs_dfs:
         # Step 1: Filter out all-NA columns from each DataFrame before concatenation
-        filtered_dfs = [df.dropna(axis=1, how='all') for df in jobs_dfs]
+        filtered_dfs = [df.dropna(axis=1, how="all") for df in jobs_dfs]
 
         # Step 2: Concatenate the filtered DataFrames
         jobs_df = pd.concat(filtered_dfs, ignore_index=True)
@@ -180,7 +181,6 @@ def scrape_jobs(
             "is_remote",
             "emails",
             "description",
-
             "company_url",
             "company_url_direct",
             "company_addresses",
@@ -203,6 +203,6 @@ def scrape_jobs(
         jobs_df = jobs_df[desired_order]
 
         # Step 4: Sort the DataFrame as required
-        return jobs_df.sort_values(by=['site', 'date_posted'], ascending=[True, False])
+        return jobs_df.sort_values(by=["site", "date_posted"], ascending=[True, False])
     else:
         return pd.DataFrame()
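
As a usage sketch of the function being edited (also not part of the commit): the parameter names below mirror the ScraperInput fields visible in the hunks above, while search_term and location are assumed from the library's public API.

from jobspy import scrape_jobs

# Parameter names mirror the ScraperInput fields shown in the hunks above;
# search_term and location are assumed from the library's public API.
jobs = scrape_jobs(
    site_name=["indeed", "linkedin", "zip_recruiter"],
    search_term="software engineer",
    location="Dallas, TX",
    results_wanted=20,
    country_indeed="USA",
    hours_old=72,
    offset=0,
)

# The returned DataFrame is sorted by site (ascending), then date_posted (descending),
# matching the sort_values call in the final hunk.
print(jobs[["site", "title", "company", "date_posted"]].head())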