updated example with exponential backoff and retry mechanism

Exponential backoff helps minimize the 429 Too Many Requests issue, and the try/except handling lets the script keep extracting data even when an error occurs, so it can be used to scrape 500-1000 jobs in one run.
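Note that the retry loop in the diff below sleeps for 100 * (retry_count + 1) seconds, which is a linear backoff rather than a strictly exponential one. A minimal sketch of true exponential backoff with jitter, assuming a hypothetical fetch_batch() wrapper around scrape_jobs for one page of results (the delay values here are illustrative, not JobSpy defaults):

import random
import time

def scrape_with_backoff(fetch_batch, max_retries=3, base_delay=10):
    # Retry fetch_batch with exponentially growing delays plus jitter.
    # fetch_batch is a hypothetical callable wrapping one scrape_jobs call.
    for retry_count in range(max_retries):
        try:
            return fetch_batch()
        except Exception as e:  # e.g. a 429 Too Many Requests surfaced as an exception
            # 10s, 20s, 40s, ... plus up to 5s of random jitter
            delay = base_delay * (2 ** retry_count) + random.uniform(0, 5)
            print(f"Error: {e}. Sleeping {delay:.0f}s before retry")
            time.sleep(delay)
    raise RuntimeError("Max retries reached")

The jitter spreads retries out so repeated failures don't hit the server at identical intervals.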
pull/81/head
Harish Vadaparty 2024-01-12 12:25:33 +05:30 committed by GitHub
parent a7ad616567
commit 36d53b01f9
1 changed file with 68 additions and 22 deletions


@@ -1,31 +1,77 @@
from jobspy import scrape_jobs
import pandas as pd
import os
import time
jobs: pd.DataFrame = scrape_jobs(
    site_name=["indeed", "linkedin", "zip_recruiter"],
    search_term="software engineer",
    location="Dallas, TX",
    results_wanted=50,  # be wary: the higher it is, the more likely you'll get blocked (a rotating proxy should work, though)
    country_indeed="USA",
    offset=25,  # start jobs from an offset (use if a search failed and you want to continue)
    # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
)
# create a new CSV filename if jobs.csv already exists
csv_filename = "jobs.csv"
counter = 1
while os.path.exists(csv_filename):
    csv_filename = f"jobs_{counter}.csv"
    counter += 1
# formatting for pandas
# total results wanted and starting offset
results_wanted = 1000
offset = 0
all_jobs = []
# maximum number of retries per batch
max_retries = 3
# number of results fetched in each iteration
results_in_each_iteration = 50
while len(all_jobs) < results_wanted:
    retry_count = 0
    while retry_count < max_retries:
        print("Scraping jobs from", offset, "to", offset + results_in_each_iteration)
        try:
            jobs = scrape_jobs(
                site_name=["indeed"],
                search_term="software engineer",
                # New York, NY
                # Dallas, TX
                # Los Angeles, CA
                location="Los Angeles, CA",
                results_wanted=min(results_in_each_iteration, results_wanted - len(all_jobs)),
                country_indeed="USA",
                offset=offset,
                # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
            )
            # Add the scraped jobs to the list
            all_jobs.extend(jobs.to_dict('records'))
            # Increment the offset for the next page of results
            offset += results_in_each_iteration
            # Delay between successful requests to avoid rate limiting (adjust as needed)
            print(f"Scraped {len(all_jobs)} jobs")
            print("Sleeping secs", 100 * (retry_count + 1))
            time.sleep(100 * (retry_count + 1))
            break  # break out of the retry loop if successful
        except Exception as e:
            print(f"Error: {e}")
            retry_count += 1
            # Back off longer after each failed attempt before retrying
            print("Sleeping secs before retry", 100 * (retry_count + 1))
            time.sleep(100 * (retry_count + 1))
    if retry_count >= max_retries:
        print("Max retries reached. Exiting.")
        break
# DataFrame from the collected job data
jobs_df = pd.DataFrame(all_jobs)
# Formatting
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", 50) # set to 0 to see full job url / desc
pd.set_option("display.max_colwidth", 50)
# 1: output to console
print(jobs)
print(jobs_df)
# 2: output to .csv
jobs.to_csv("./jobs.csv", index=False)
print("outputted to jobs.csv")
# 3: output to .xlsx
# jobs.to_xlsx('jobs.xlsx', index=False)
# 4: display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook)
# display(jobs)
jobs_df.to_csv(csv_filename, index=False)
print(f"Outputted to {csv_filename}")