From 89a5264391d9f47781112bd942c347b3062a4358 Mon Sep 17 00:00:00 2001
From: Harish Vadaparty
Date: Fri, 12 Jan 2024 23:54:00 +0530
Subject: [PATCH] add long scrape example (#81)

---
 .../{JobSpy_Demo.py => JobSpy_AllSites.py}   |  7 +-
 examples/JobSpy_LongScrape.py                | 77 +++++++++++++++++++
 2 files changed, 80 insertions(+), 4 deletions(-)
 rename examples/{JobSpy_Demo.py => JobSpy_AllSites.py} (72%)
 create mode 100644 examples/JobSpy_LongScrape.py

diff --git a/examples/JobSpy_Demo.py b/examples/JobSpy_AllSites.py
similarity index 72%
rename from examples/JobSpy_Demo.py
rename to examples/JobSpy_AllSites.py
index c982793..22deea0 100644
--- a/examples/JobSpy_Demo.py
+++ b/examples/JobSpy_AllSites.py
@@ -2,12 +2,11 @@ from jobspy import scrape_jobs
 import pandas as pd
 
 jobs: pd.DataFrame = scrape_jobs(
-    site_name=["indeed", "linkedin", "zip_recruiter"],
+    site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor"],
     search_term="software engineer",
     location="Dallas, TX",
-    results_wanted=50,  # be wary: the higher it is, the more likely you'll get blocked (rotating proxy should work tho)
+    results_wanted=25,  # be wary: the higher it is, the more likely you'll get blocked (rotating proxy can help tho)
     country_indeed="USA",
-    offset=25  # start jobs from an offset (use if search failed and want to continue)
     # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
 )
 
@@ -28,4 +27,4 @@ print("outputted to jobs.csv")
 # jobs.to_xlsx('jobs.xlsx', index=False)
 
 # 4: display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook)
-# display(jobs)
+# display(jobs)
\ No newline at end of file
diff --git a/examples/JobSpy_LongScrape.py b/examples/JobSpy_LongScrape.py
new file mode 100644
index 0000000..189ca81
--- /dev/null
+++ b/examples/JobSpy_LongScrape.py
@@ -0,0 +1,77 @@
+from jobspy import scrape_jobs
+import pandas as pd
+import os
+import time
+
+# create a new csv filename if jobs.csv already exists
+csv_filename = "jobs.csv"
+counter = 1
+while os.path.exists(csv_filename):
+    csv_filename = f"jobs_{counter}.csv"
+    counter += 1
+
+# results wanted and offset
+results_wanted = 1000
+offset = 0
+
+all_jobs = []
+
+# max retries
+max_retries = 3
+
+# number of results at each iteration
+results_in_each_iteration = 30
+
+while len(all_jobs) < results_wanted:
+    retry_count = 0
+    while retry_count < max_retries:
+        print("Scraping jobs from offset", offset, "to", offset + results_in_each_iteration)
+        try:
+            jobs = scrape_jobs(
+                site_name=["indeed"],
+                search_term="software engineer",
+                # New York, NY
+                # Dallas, TX
+
+                # Los Angeles, CA
+                location="Los Angeles, CA",
+                results_wanted=min(results_in_each_iteration, results_wanted - len(all_jobs)),
+                country_indeed="USA",
+                offset=offset,
+                # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
+            )
+
+            # Add the scraped jobs to the list
+            all_jobs.extend(jobs.to_dict('records'))
+
+            # Increment the offset for the next page of results
+            offset += results_in_each_iteration
+
+            # Add a delay to avoid rate limiting (you can adjust the delay time as needed)
+            print(f"Scraped {len(all_jobs)} jobs")
+            print("Sleeping secs", 100 * (retry_count + 1))
+            time.sleep(100 * (retry_count + 1))  # sleep between requests (100s, longer after retries)
+
+            break  # Break out of the retry loop if successful
+        except Exception as e:
+            print(f"Error: {e}")
+            retry_count += 1
+            print("Sleeping secs before retry", 100 * (retry_count + 1))
+            time.sleep(100 * (retry_count + 1))
+            if retry_count >= max_retries:
+                print("Max retries reached. Exiting.")
Exiting.") + break + +# DataFrame from the collected job data +jobs_df = pd.DataFrame(all_jobs) + +# Formatting +pd.set_option("display.max_columns", None) +pd.set_option("display.max_rows", None) +pd.set_option("display.width", None) +pd.set_option("display.max_colwidth", 50) + +print(jobs_df) + +jobs_df.to_csv(csv_filename, index=False) +print(f"Outputted to {csv_filename}")