From 9a3dff3c0f742149b2248e389ac422d78c0cb632 Mon Sep 17 00:00:00 2001
From: troy-conte <102304795+troy-conte@users.noreply.github.com>
Date: Thu, 29 Feb 2024 16:13:47 -0500
Subject: [PATCH] Update __init__.py

Handle the pandas concat FutureWarning by dropping all-NA columns before
concatenation, then restoring any missing desired columns afterwards.
---
 src/jobspy/__init__.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py
index c4c87d9..9aa3e9f 100644
--- a/src/jobspy/__init__.py
+++ b/src/jobspy/__init__.py
@@ -152,9 +152,15 @@
         jobs_dfs.append(job_df)
 
     if jobs_dfs:
-        jobs_df = pd.concat(jobs_dfs, ignore_index=True)
-        desired_order: list[str] = [
-            "job_url_hyper" if hyperlinks else "job_url",
+        # Step 1: Filter out all-NA columns from each DataFrame before concatenation
+        filtered_dfs = [df.dropna(axis=1, how='all') for df in jobs_dfs]
+
+        # Step 2: Concatenate the filtered DataFrames
+        jobs_df = pd.concat(filtered_dfs, ignore_index=True)
+
+        # Desired column order
+        desired_order = [
+            "job_url_hyper" if 'hyperlinks' in locals() or 'hyperlinks' in globals() else "job_url",
             "site",
             "title",
             "company",
@@ -172,6 +178,16 @@
             "emails",
             "description",
         ]
-        return jobs_df[desired_order].sort_values(by=['site', 'date_posted'], ascending=[True, False])
+
+        # Step 3: Ensure all desired columns are present, adding missing ones as empty
+        for column in desired_order:
+            if column not in jobs_df.columns:
+                jobs_df[column] = None  # Add missing columns as empty
+
+        # Reorder the DataFrame according to the desired order
+        jobs_df = jobs_df[desired_order]
+
+        # Step 4: Sort the DataFrame as required
+        return jobs_df.sort_values(by=['site', 'date_posted'], ascending=[True, False])
     else:
         return pd.DataFrame()