Mirror of https://github.com/Bunsly/JobSpy.git, synced 2026-03-05 03:54:31 -08:00.
Remove pandas warning (#118)
This commit is contained in:
@@ -152,8 +152,14 @@ def scrape_jobs(
|
||||
jobs_dfs.append(job_df)
|
||||
|
||||
if jobs_dfs:
|
||||
jobs_df = pd.concat(jobs_dfs, ignore_index=True)
|
||||
desired_order: list[str] = [
|
||||
# Step 1: Filter out all-NA columns from each DataFrame before concatenation
|
||||
filtered_dfs = [df.dropna(axis=1, how='all') for df in jobs_dfs]
|
||||
|
||||
# Step 2: Concatenate the filtered DataFrames
|
||||
jobs_df = pd.concat(filtered_dfs, ignore_index=True)
|
||||
|
||||
# Desired column order
|
||||
desired_order = [
|
||||
"job_url_hyper" if hyperlinks else "job_url",
|
||||
"site",
|
||||
"title",
|
||||
@@ -172,6 +178,16 @@ def scrape_jobs(
|
||||
"emails",
|
||||
"description",
|
||||
]
|
||||
return jobs_df[desired_order].sort_values(by=['site', 'date_posted'], ascending=[True, False])
|
||||
|
||||
# Step 3: Ensure all desired columns are present, adding missing ones as empty
|
||||
for column in desired_order:
|
||||
if column not in jobs_df.columns:
|
||||
jobs_df[column] = None # Add missing columns as empty
|
||||
|
||||
# Reorder the DataFrame according to the desired order
|
||||
jobs_df = jobs_df[desired_order]
|
||||
|
||||
# Step 4: Sort the DataFrame as required
|
||||
return jobs_df.sort_values(by=['site', 'date_posted'], ascending=[True, False])
|
||||
else:
|
||||
return pd.DataFrame()
|
||||
|
||||
@@ -82,7 +82,6 @@ class IndeedScraper(Scraper):
|
||||
if not new_jobs:
|
||||
break
|
||||
|
||||
|
||||
if len(self.seen_urls) > scraper_input.results_wanted:
|
||||
job_list = job_list[:scraper_input.results_wanted]
|
||||
|
||||
@@ -124,12 +123,15 @@ class IndeedScraper(Scraper):
|
||||
return job_list
|
||||
|
||||
jobs = IndeedScraper._parse_jobs(soup)
|
||||
if not jobs:
|
||||
return []
|
||||
if (
|
||||
not jobs.get("metaData", {})
|
||||
.get("mosaicProviderJobCardsModel", {})
|
||||
.get("results")
|
||||
):
|
||||
raise IndeedException("No jobs found.")
|
||||
logger.error("Indeed - No jobs found.")
|
||||
return []
|
||||
|
||||
jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
|
||||
job_keys = [job['jobkey'] for job in jobs]
|
||||
@@ -302,11 +304,11 @@ class IndeedScraper(Scraper):
|
||||
jobs = json.loads(m.group(1).strip())
|
||||
return jobs
|
||||
else:
|
||||
raise IndeedException("Could not find mosaic provider job cards data")
|
||||
logger.warning(f'Indeed: Could not find mosaic provider job cards data')
|
||||
return {}
|
||||
else:
|
||||
raise IndeedException(
|
||||
"Could not find any results for the search"
|
||||
)
|
||||
logger.warning(f"Indeed: Could not parse any jobs on the page")
|
||||
return {}
|
||||
|
||||
@staticmethod
|
||||
def _is_job_remote(job: dict, job_detailed: dict, description: str) -> bool:
|
||||
|
||||
@@ -104,9 +104,9 @@ class LinkedInScraper(Scraper):
|
||||
return JobResponse(job_list=job_list)
|
||||
except Exception as e:
|
||||
if "Proxy responded with" in str(e):
|
||||
logger.error(f'Indeed: Bad proxy')
|
||||
logger.error(f'LinkedIn: Bad proxy')
|
||||
else:
|
||||
logger.error(f'Indeed: {str(e)}')
|
||||
logger.error(f'LinkedIn: {str(e)}')
|
||||
return JobResponse(job_list=job_list)
|
||||
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
|
||||
Reference in New Issue
Block a user