mirror of https://github.com/Bunsly/JobSpy
Removed the code that parses the data to CSV
parent
79189f49ef
commit
54022f2b57
|
@ -149,135 +149,3 @@ def scrape_jobs(
|
|||
logger.error(f"Future Error occurred: {e}")
|
||||
|
||||
return merged_jobs
|
||||
|
||||
def convert_to_annual(job_data: dict):
    """Convert a job's salary range to a yearly figure, in place.

    Reads ``job_data["interval"]`` and, when it names a supported pay
    period ("hourly", "monthly", "weekly" or "daily"), multiplies
    ``min_amount`` and ``max_amount`` by the number of such periods in a
    year and sets ``interval`` to "yearly".

    Any other interval leaves ``job_data`` untouched, so an amount that
    was never converted is not relabelled as a yearly one. An amount that
    is missing or ``None`` is skipped instead of raising.

    Args:
        job_data: Mutable job record holding the "interval",
            "min_amount" and "max_amount" keys.

    Returns:
        None. ``job_data`` is modified in place.

    Raises:
        KeyError: If ``job_data`` has no "interval" key.
    """
    # Pay periods per year. 2080 is presumably 40 h/week x 52 weeks and
    # 260 presumably 5 days/week x 52 weeks -- TODO confirm the intent.
    periods_per_year = {
        "hourly": 2080,
        "monthly": 12,
        "weekly": 52,
        "daily": 260,
    }

    factor = periods_per_year.get(job_data["interval"])
    if factor is None:
        # Unsupported interval: nothing was converted, so keep the
        # original label rather than claiming the amounts are yearly.
        return

    for key in ("min_amount", "max_amount"):
        if job_data.get(key) is not None:
            job_data[key] *= factor
    job_data["interval"] = "yearly"
|
||||
|
||||
jobs_dfs: list[pd.DataFrame] = []
|
||||
|
||||
for site, job_response in site_to_jobs_dict.items():
|
||||
for job in job_response.jobs:
|
||||
job_data = job.dict()
|
||||
job_url = job_data["job_url"]
|
||||
job_data["job_url_hyper"] = f'<a href="{job_url}">{job_url}</a>'
|
||||
job_data["site"] = site
|
||||
job_data["company"] = job_data["company_name"]
|
||||
job_data["job_type"] = (
|
||||
", ".join(job_type.value[0]
|
||||
for job_type in job_data["job_type"])
|
||||
if job_data["job_type"]
|
||||
else None
|
||||
)
|
||||
job_data["emails"] = (
|
||||
", ".join(job_data["emails"]) if job_data["emails"] else None
|
||||
)
|
||||
if job_data["location"]:
|
||||
job_data["location"] = Location(
|
||||
**job_data["location"]
|
||||
).display_location()
|
||||
|
||||
compensation_obj = job_data.get("compensation")
|
||||
if compensation_obj and isinstance(compensation_obj, dict):
|
||||
job_data["interval"] = (
|
||||
compensation_obj.get("interval").value
|
||||
if compensation_obj.get("interval")
|
||||
else None
|
||||
)
|
||||
job_data["min_amount"] = compensation_obj.get("min_amount")
|
||||
job_data["max_amount"] = compensation_obj.get("max_amount")
|
||||
job_data["currency"] = compensation_obj.get("currency", "USD")
|
||||
job_data["salary_source"] = SalarySource.DIRECT_DATA.value
|
||||
if enforce_annual_salary and (
|
||||
job_data["interval"]
|
||||
and job_data["interval"] != "yearly"
|
||||
and job_data["min_amount"]
|
||||
and job_data["max_amount"]
|
||||
):
|
||||
convert_to_annual(job_data)
|
||||
|
||||
else:
|
||||
if country_enum == Country.USA:
|
||||
(
|
||||
job_data["interval"],
|
||||
job_data["min_amount"],
|
||||
job_data["max_amount"],
|
||||
job_data["currency"],
|
||||
) = extract_salary(
|
||||
job_data["description"],
|
||||
enforce_annual_salary=enforce_annual_salary,
|
||||
)
|
||||
job_data["salary_source"] = SalarySource.DESCRIPTION.value
|
||||
|
||||
job_data["salary_source"] = (
|
||||
job_data["salary_source"]
|
||||
if "min_amount" in job_data and job_data["min_amount"]
|
||||
else None
|
||||
)
|
||||
job_df = pd.DataFrame([job_data])
|
||||
jobs_dfs.append(job_df)
|
||||
|
||||
if jobs_dfs:
|
||||
# Step 1: Filter out all-NA columns from each DataFrame before concatenation
|
||||
filtered_dfs = [df.dropna(axis=1, how="all") for df in jobs_dfs]
|
||||
|
||||
# Step 2: Concatenate the filtered DataFrames
|
||||
jobs_df = pd.concat(filtered_dfs, ignore_index=True)
|
||||
|
||||
# Desired column order
|
||||
desired_order = [
|
||||
"id",
|
||||
"site",
|
||||
"job_url_hyper" if hyperlinks else "job_url",
|
||||
"job_url_direct",
|
||||
"title",
|
||||
"company",
|
||||
"location",
|
||||
"date_posted",
|
||||
"job_type",
|
||||
"salary_source",
|
||||
"interval",
|
||||
"min_amount",
|
||||
"max_amount",
|
||||
"currency",
|
||||
"is_remote",
|
||||
"job_level",
|
||||
"job_function",
|
||||
"listing_type",
|
||||
"emails",
|
||||
"description",
|
||||
"company_industry",
|
||||
"company_url",
|
||||
"company_logo",
|
||||
"company_url_direct",
|
||||
"company_addresses",
|
||||
"company_num_employees",
|
||||
"company_revenue",
|
||||
"company_description",
|
||||
]
|
||||
|
||||
# Step 3: Ensure all desired columns are present, adding missing ones as empty
|
||||
for column in desired_order:
|
||||
if column not in jobs_df.columns:
|
||||
jobs_df[column] = None # Add missing columns as empty
|
||||
|
||||
# Reorder the DataFrame according to the desired order
|
||||
jobs_df = jobs_df[desired_order]
|
||||
|
||||
# Step 4: Sort the DataFrame as required
|
||||
return jobs_df.sort_values(
|
||||
by=["site", "date_posted"], ascending=[True, False]
|
||||
).reset_index(drop=True)
|
||||
else:
|
||||
return pd.DataFrame()
|
||||
|
|
Loading…
Reference in New Issue