From f8a4eccc6b8bf206ab98e59d1de43bbc4536409c Mon Sep 17 00:00:00 2001
From: Cullen Watson <cullen@bunsly.com>
Date: Thu, 29 Feb 2024 21:30:56 -0600
Subject: [PATCH] Remove pandas warning (#118)

---
 poetry.lock                              | 20 +++++++++++++++-----
 pyproject.toml                           |  4 ++--
 src/jobspy/__init__.py                   | 22 +++++++++++++++++++---
 src/jobspy/scrapers/indeed/__init__.py   | 14 ++++++++------
 src/jobspy/scrapers/linkedin/__init__.py |  4 ++--
 5 files changed, 46 insertions(+), 18 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index d4581f9..20eb44d 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
 
 [[package]]
 name = "annotated-types"
@@ -1064,6 +1064,16 @@ files = [
     {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"},
     {file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"},
     {file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"},
+    {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"},
+    {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"},
+    {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"},
+    {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"},
+    {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"},
+    {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"},
+    {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"},
+    {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"},
+    {file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"},
+    {file = "MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"},
     {file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"},
     {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"},
     {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"},
@@ -2271,13 +2281,13 @@ test = ["flake8", "isort", "pytest"]
 
 [[package]]
 name = "tls-client"
-version = "1.0"
+version = "1.0.1"
 description = "Advanced Python HTTP Client."
 optional = false
 python-versions = "*"
 files = [
-    {file = "tls_client-1.0-py3-none-any.whl", hash = "sha256:f1183f5e18cb31914bd62d11b350a33ea0293ea80fb91d69a3072821dece3e66"},
-    {file = "tls_client-1.0.tar.gz", hash = "sha256:7f6de48ad4a0ef69b72682c76ce604155971e07b4bfb2148a36276194ae3e7a0"},
+    {file = "tls_client-1.0.1-py3-none-any.whl", hash = "sha256:2f8915c0642c2226c9e33120072a2af082812f6310d32f4ea4da322db7d3bb1c"},
+    {file = "tls_client-1.0.1.tar.gz", hash = "sha256:dad797f3412bb713606e0765d489f547ffb580c5ffdb74aed47a183ce8505ff5"},
 ]
 
 [[package]]
@@ -2446,4 +2456,4 @@ files = [
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "40cdc19a57cba0d21ff4f0fcfa53e14a073fcccd9f2a871440e056ab6e8fade0"
+content-hash = "eea3694820df164179cdd8312d382eb5b29d6317c4d34c586e8866c69aaee9e9"
diff --git a/pyproject.toml b/pyproject.toml
index 8fd7ba7..42dcf96 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.45"
+version = "1.1.46"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"
@@ -13,12 +13,12 @@ packages = [
 [tool.poetry.dependencies]
 python = "^3.10"
 requests = "^2.31.0"
-tls-client = "*"
 beautifulsoup4 = "^4.12.2"
 pandas = "^2.1.0"
 NUMPY = "1.24.2"
 pydantic = "^2.3.0"
 html2text = "^2020.1.16"
+tls-client = "^1.0.1"
 
 
 [tool.poetry.group.dev.dependencies]
diff --git a/src/jobspy/__init__.py b/src/jobspy/__init__.py
index c4c87d9..866c662 100644
--- a/src/jobspy/__init__.py
+++ b/src/jobspy/__init__.py
@@ -152,8 +152,14 @@ def scrape_jobs(
             jobs_dfs.append(job_df)
 
     if jobs_dfs:
-        jobs_df = pd.concat(jobs_dfs, ignore_index=True)
-        desired_order: list[str] = [
+        # Step 1: Drop all-NA columns from each DataFrame so pd.concat does not emit its FutureWarning about all-NA entries
+        filtered_dfs = [df.dropna(axis=1, how='all') for df in jobs_dfs]
+        
+        # Step 2: Concatenate the filtered DataFrames
+        jobs_df = pd.concat(filtered_dfs, ignore_index=True)
+        
+        # Desired column order
+        desired_order = [
             "job_url_hyper" if hyperlinks else "job_url",
             "site",
             "title",
@@ -172,6 +178,16 @@ def scrape_jobs(
             "emails",
             "description",
         ]
-        return jobs_df[desired_order].sort_values(by=['site', 'date_posted'], ascending=[True, False])
+        
+        # Step 3: Re-add any desired column that Step 1 dropped (or that no scraper returned), filled with None
+        for column in desired_order:
+            if column not in jobs_df.columns:
+                jobs_df[column] = None  # Add missing columns as empty
+        
+        # Reorder the DataFrame according to the desired order
+        jobs_df = jobs_df[desired_order]
+        
+        # Step 4: Sort the DataFrame as required
+        return jobs_df.sort_values(by=['site', 'date_posted'], ascending=[True, False])
     else:
         return pd.DataFrame()
diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py
index 27c3d34..acff351 100644
--- a/src/jobspy/scrapers/indeed/__init__.py
+++ b/src/jobspy/scrapers/indeed/__init__.py
@@ -82,7 +82,6 @@ class IndeedScraper(Scraper):
             if not new_jobs:
                 break
 
-
         if len(self.seen_urls) > scraper_input.results_wanted:
             job_list = job_list[:scraper_input.results_wanted]
 
@@ -124,12 +123,15 @@ class IndeedScraper(Scraper):
             return job_list
 
         jobs = IndeedScraper._parse_jobs(soup)
+        if not jobs:
+            return []
         if (
             not jobs.get("metaData", {})
             .get("mosaicProviderJobCardsModel", {})
             .get("results")
         ):
-            raise IndeedException("No jobs found.")
+            logger.error("Indeed - No jobs found.")
+            return []
 
         jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
         job_keys = [job['jobkey'] for job in jobs]
@@ -302,11 +304,11 @@ class IndeedScraper(Scraper):
                 jobs = json.loads(m.group(1).strip())
                 return jobs
             else:
-                raise IndeedException("Could not find mosaic provider job cards data")
+                logger.warning(f'Indeed: Could not find mosaic provider job cards data')
+                return {}
         else:
-            raise IndeedException(
-                "Could not find any results for the search"
-            )
+            logger.warning(f"Indeed: Could not parse any jobs on the page")
+            return {}
 
     @staticmethod
     def _is_job_remote(job: dict, job_detailed: dict, description: str) -> bool:
diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py
index ad17cd4..cecb761 100644
--- a/src/jobspy/scrapers/linkedin/__init__.py
+++ b/src/jobspy/scrapers/linkedin/__init__.py
@@ -104,9 +104,9 @@ class LinkedInScraper(Scraper):
                     return JobResponse(job_list=job_list)
             except Exception as e:
                 if "Proxy responded with" in str(e):
-                    logger.error(f'Indeed: Bad proxy')
+                    logger.error(f'LinkedIn: Bad proxy')
                 else:
-                    logger.error(f'Indeed: {str(e)}')
+                    logger.error(f'LinkedIn: {str(e)}')
                 return JobResponse(job_list=job_list)
 
             soup = BeautifulSoup(response.text, "html.parser")