diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py index 168b9da..8548c9f 100644 --- a/src/jobspy/__init__.py +++ b/src/jobspy/__init__.py @@ -149,135 +149,3 @@ def scrape_jobs( logger.error(f"Future Error occurred: {e}") return merged_jobs - - def convert_to_annual(job_data: dict): - if job_data["interval"] == "hourly": - job_data["min_amount"] *= 2080 - job_data["max_amount"] *= 2080 - if job_data["interval"] == "monthly": - job_data["min_amount"] *= 12 - job_data["max_amount"] *= 12 - if job_data["interval"] == "weekly": - job_data["min_amount"] *= 52 - job_data["max_amount"] *= 52 - if job_data["interval"] == "daily": - job_data["min_amount"] *= 260 - job_data["max_amount"] *= 260 - job_data["interval"] = "yearly" - - jobs_dfs: list[pd.DataFrame] = [] - - for site, job_response in site_to_jobs_dict.items(): - for job in job_response.jobs: - job_data = job.dict() - job_url = job_data["job_url"] - job_data["job_url_hyper"] = f'{job_url}' - job_data["site"] = site - job_data["company"] = job_data["company_name"] - job_data["job_type"] = ( - ", ".join(job_type.value[0] - for job_type in job_data["job_type"]) - if job_data["job_type"] - else None - ) - job_data["emails"] = ( - ", ".join(job_data["emails"]) if job_data["emails"] else None - ) - if job_data["location"]: - job_data["location"] = Location( - **job_data["location"] - ).display_location() - - compensation_obj = job_data.get("compensation") - if compensation_obj and isinstance(compensation_obj, dict): - job_data["interval"] = ( - compensation_obj.get("interval").value - if compensation_obj.get("interval") - else None - ) - job_data["min_amount"] = compensation_obj.get("min_amount") - job_data["max_amount"] = compensation_obj.get("max_amount") - job_data["currency"] = compensation_obj.get("currency", "USD") - job_data["salary_source"] = SalarySource.DIRECT_DATA.value - if enforce_annual_salary and ( - job_data["interval"] - and job_data["interval"] != "yearly" - and job_data["min_amount"] - and job_data["max_amount"] - ): - convert_to_annual(job_data) - - else: - if country_enum == Country.USA: - ( - job_data["interval"], - job_data["min_amount"], - job_data["max_amount"], - job_data["currency"], - ) = extract_salary( - job_data["description"], - enforce_annual_salary=enforce_annual_salary, - ) - job_data["salary_source"] = SalarySource.DESCRIPTION.value - - job_data["salary_source"] = ( - job_data["salary_source"] - if "min_amount" in job_data and job_data["min_amount"] - else None - ) - job_df = pd.DataFrame([job_data]) - jobs_dfs.append(job_df) - - if jobs_dfs: - # Step 1: Filter out all-NA columns from each DataFrame before concatenation - filtered_dfs = [df.dropna(axis=1, how="all") for df in jobs_dfs] - - # Step 2: Concatenate the filtered DataFrames - jobs_df = pd.concat(filtered_dfs, ignore_index=True) - - # Desired column order - desired_order = [ - "id", - "site", - "job_url_hyper" if hyperlinks else "job_url", - "job_url_direct", - "title", - "company", - "location", - "date_posted", - "job_type", - "salary_source", - "interval", - "min_amount", - "max_amount", - "currency", - "is_remote", - "job_level", - "job_function", - "listing_type", - "emails", - "description", - "company_industry", - "company_url", - "company_logo", - "company_url_direct", - "company_addresses", - "company_num_employees", - "company_revenue", - "company_description", - ] - - # Step 3: Ensure all desired columns are present, adding missing ones as empty - for column in desired_order: - if column not in jobs_df.columns: - jobs_df[column] = None # Add missing columns as empty - - # Reorder the DataFrame according to the desired order - jobs_df = jobs_df[desired_order] - - # Step 4: Sort the DataFrame as required - return jobs_df.sort_values( - by=["site", "date_posted"], ascending=[True, False] - ).reset_index(drop=True) - else: - return pd.DataFrame()