mirror of https://github.com/Bunsly/JobSpy
Removed the code that parses the data to CSV
parent
79189f49ef
commit
54022f2b57
|
@ -149,135 +149,3 @@ def scrape_jobs(
|
||||||
logger.error(f"Future Error occurred: {e}")
|
logger.error(f"Future Error occurred: {e}")
|
||||||
|
|
||||||
return merged_jobs
|
return merged_jobs
|
||||||
|
|
||||||
def convert_to_annual(job_data: dict):
    """Convert a periodic salary range to an annual one, in place.

    Multiplies ``job_data["min_amount"]`` and ``job_data["max_amount"]`` by
    the number of pay periods in a year for the given ``interval`` and
    relabels ``interval`` as ``"yearly"``.

    Args:
        job_data: mutable mapping with ``interval``, ``min_amount`` and
            ``max_amount`` keys. The caller must ensure both amounts are
            numeric (non-None) before calling.
    """
    # Pay periods per year: 40 h/week * 52 weeks = 2080 hours,
    # 5 days/week * 52 weeks = 260 days.
    _PERIODS_PER_YEAR = {
        "hourly": 2080,
        "daily": 260,
        "weekly": 52,
        "monthly": 12,
    }
    multiplier = _PERIODS_PER_YEAR.get(job_data["interval"])
    if multiplier is not None:
        job_data["min_amount"] *= multiplier
        job_data["max_amount"] *= multiplier
    # Preserve original behavior: the interval is relabeled "yearly" even
    # when it was not one of the known periodic intervals (amounts are then
    # left untouched).
    job_data["interval"] = "yearly"
# Tail of the enclosing scrape/merge function (header not visible here):
# flattens each scraped JobPost into a dict, normalizes salary fields, and
# assembles everything into a single sorted pandas DataFrame.
# NOTE(review): `site_to_jobs_dict`, `enforce_annual_salary`, `hyperlinks`,
# `country_enum`, `Location`, `SalarySource`, `Country`, `extract_salary`
# come from the enclosing scope / module imports — not visible in this view.
jobs_dfs: list[pd.DataFrame] = []

for site, job_response in site_to_jobs_dict.items():
    for job in job_response.jobs:
        # Pydantic-style model -> plain dict (presumably pydantic v1's
        # .dict(); verify against the JobPost model).
        job_data = job.dict()
        job_url = job_data["job_url"]
        # HTML anchor variant of the URL, used when `hyperlinks` is set
        # (see `desired_order` below).
        job_data["job_url_hyper"] = f'<a href="{job_url}">{job_url}</a>'
        job_data["site"] = site
        job_data["company"] = job_data["company_name"]
        # Collapse the list of job-type enums to a comma-separated string;
        # `job_type.value[0]` picks the first alias of each enum value.
        job_data["job_type"] = (
            ", ".join(job_type.value[0]
            for job_type in job_data["job_type"])
            if job_data["job_type"]
            else None
        )
        # Same collapse for the list of scraped e-mail addresses.
        job_data["emails"] = (
            ", ".join(job_data["emails"]) if job_data["emails"] else None
        )
        # Replace the location dict with its human-readable display string.
        if job_data["location"]:
            job_data["location"] = Location(
                **job_data["location"]
            ).display_location()

        compensation_obj = job_data.get("compensation")
        if compensation_obj and isinstance(compensation_obj, dict):
            # Salary came directly from the job board: flatten the nested
            # compensation dict into top-level columns.
            job_data["interval"] = (
                compensation_obj.get("interval").value
                if compensation_obj.get("interval")
                else None
            )
            job_data["min_amount"] = compensation_obj.get("min_amount")
            job_data["max_amount"] = compensation_obj.get("max_amount")
            job_data["currency"] = compensation_obj.get("currency", "USD")
            job_data["salary_source"] = SalarySource.DIRECT_DATA.value
            # Optionally annualize non-yearly ranges; requires both bounds
            # to be present (convert_to_annual multiplies them in place).
            if enforce_annual_salary and (
                job_data["interval"]
                and job_data["interval"] != "yearly"
                and job_data["min_amount"]
                and job_data["max_amount"]
            ):
                convert_to_annual(job_data)

        else:
            # No structured compensation: for US listings, fall back to
            # parsing a salary range out of the free-text description.
            if country_enum == Country.USA:
                (
                    job_data["interval"],
                    job_data["min_amount"],
                    job_data["max_amount"],
                    job_data["currency"],
                ) = extract_salary(
                    job_data["description"],
                    enforce_annual_salary=enforce_annual_salary,
                )
                job_data["salary_source"] = SalarySource.DESCRIPTION.value

        # Blank out salary_source when no usable min_amount exists.
        # NOTE(review): if neither branch above set "salary_source" but
        # "min_amount" is present and truthy, this lookup would raise
        # KeyError — confirm the model always provides one of the two.
        job_data["salary_source"] = (
            job_data["salary_source"]
            if "min_amount" in job_data and job_data["min_amount"]
            else None
        )
        # One single-row frame per job; concatenated below.
        job_df = pd.DataFrame([job_data])
        jobs_dfs.append(job_df)

if jobs_dfs:
    # Step 1: Filter out all-NA columns from each DataFrame before concatenation
    # (avoids pandas' FutureWarning about concatenating all-NA columns).
    filtered_dfs = [df.dropna(axis=1, how="all") for df in jobs_dfs]

    # Step 2: Concatenate the filtered DataFrames
    jobs_df = pd.concat(filtered_dfs, ignore_index=True)

    # Desired column order
    desired_order = [
        "id",
        "site",
        "job_url_hyper" if hyperlinks else "job_url",
        "job_url_direct",
        "title",
        "company",
        "location",
        "date_posted",
        "job_type",
        "salary_source",
        "interval",
        "min_amount",
        "max_amount",
        "currency",
        "is_remote",
        "job_level",
        "job_function",
        "listing_type",
        "emails",
        "description",
        "company_industry",
        "company_url",
        "company_logo",
        "company_url_direct",
        "company_addresses",
        "company_num_employees",
        "company_revenue",
        "company_description",
    ]

    # Step 3: Ensure all desired columns are present, adding missing ones as empty
    for column in desired_order:
        if column not in jobs_df.columns:
            jobs_df[column] = None  # Add missing columns as empty

    # Reorder the DataFrame according to the desired order
    jobs_df = jobs_df[desired_order]

    # Step 4: Sort the DataFrame as required
    # (by site ascending, then most recent postings first)
    return jobs_df.sort_values(
        by=["site", "date_posted"], ascending=[True, False]
    ).reset_index(drop=True)
else:
    # No jobs scraped from any site: return an empty frame.
    return pd.DataFrame()
|
Loading…
Reference in New Issue