mirror of https://github.com/Bunsly/JobSpy
78 lines · 2.4 KiB · Python
import os
import time

import pandas as pd

from jobspy import scrape_jobs
def unique_csv_filename(base="jobs.csv"):
    """Return a CSV filename that does not clobber an existing file.

    Returns *base* if no such file exists in the current directory,
    otherwise the first '<stem>_<n><ext>' (n = 1, 2, ...) that is free,
    e.g. jobs.csv -> jobs_1.csv -> jobs_2.csv.
    """
    stem, ext = os.path.splitext(base)
    candidate = base
    counter = 1
    while os.path.exists(candidate):
        candidate = f"{stem}_{counter}{ext}"
        counter += 1
    return candidate


# Creates a new CSV filename if jobs.csv already exists.
csv_filename = unique_csv_filename()
# Total number of jobs to collect across all pages.
results_wanted = 1000

# Paging offset passed to the scraper; advances after each successful page.
offset = 0

# Accumulates one dict per scraped job (rows of the final DataFrame).
all_jobs = []

# Maximum attempts per page before giving up.
max_retries = 3

# Number of results requested from the scraper on each iteration.
results_in_each_iteration = 30
# Scrape in pages until results_wanted jobs are collected, retrying each
# page up to max_retries times before giving up entirely.
scrape_failed = False
while len(all_jobs) < results_wanted and not scrape_failed:
    retry_count = 0
    while retry_count < max_retries:
        print("Doing from", offset, "to", offset + results_in_each_iteration, "jobs")
        try:
            jobs = scrape_jobs(
                site_name=["indeed"],
                search_term="software engineer",
                # Other locations tried previously: New York, NY / Dallas, TX
                location="Los Angeles, CA",
                # Never request more than we still need.
                results_wanted=min(results_in_each_iteration, results_wanted - len(all_jobs)),
                country_indeed="USA",
                offset=offset,
                # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
            )

            # Stop cleanly when the site has no more results for this search;
            # otherwise the outer loop would page past the end forever.
            if len(jobs) == 0:
                print("No more results returned. Exiting.")
                scrape_failed = True
                break

            # Add the scraped jobs to the list
            all_jobs.extend(jobs.to_dict("records"))

            # Increment the offset for the next page of results
            offset += results_in_each_iteration

            print(f"Scraped {len(all_jobs)} jobs")

            # Throttle between successful requests to avoid rate limiting
            # (100 s, since retry_count is 0 on the success path).
            print("Sleeping secs", 100 * (retry_count + 1))
            time.sleep(100 * (retry_count + 1))

            break  # Break out of the retry loop if successful
        except Exception as e:
            print(f"Error: {e}")
            retry_count += 1
            if retry_count >= max_retries:
                print("Max retries reached. Exiting.")
                # BUGFIX: previously only the retry loop ended here, so the
                # outer loop re-entered it and the script spun forever on the
                # same failing page. Flag the failure so the outer loop stops.
                scrape_failed = True
                break
            # Back off progressively harder before the next attempt.
            # (Checked the retry ceiling first so we don't sleep just to quit.)
            print("Sleeping secs before retry", 100 * (retry_count + 1))
            time.sleep(100 * (retry_count + 1))
# Build a single DataFrame from every job dict collected above.
jobs_df = pd.DataFrame(all_jobs)

# Console formatting: show all rows and columns at full width, but cap
# individual cell width at 50 characters.
for option, value in [
    ("display.max_columns", None),
    ("display.max_rows", None),
    ("display.width", None),
    ("display.max_colwidth", 50),
]:
    pd.set_option(option, value)

print(jobs_df)

# Persist to the collision-free filename chosen at startup.
jobs_df.to_csv(csv_filename, index=False)
print(f"Outputted to {csv_filename}")