Mirror of https://github.com/Bunsly/JobSpy.git, synced 2026-03-04 19:44:30 -08:00
Compare commits
18 Commits
- 8e2ab277da
- ce3bd84ee5
- 1ccf2290fe
- ec2eefc58a
- 13c7694474
- bbe46fe3f4
- b97c73ffd6
- 5b3627b244
- 2ec3b04777
- 89a5264391
- a7ad616567
- 53bc33a43a
- 22870438c7
- aeb93b99f5
- a5916edcdd
- 33d442bf1e
- 6587e464fa
- eed7fca300
README.md (16 changed lines)
@@ -5,10 +5,7 @@
**Not technical?** Try out the web scraping tool on our site at [usejobspy.com](https://usejobspy.com).

*Looking to build a data-focused software product?* **[Book a call](https://bunsly.com/)** *to
work with us.*

Check out another project we wrote: ***[HomeHarvest](https://github.com/Bunsly/HomeHarvest)** – a Python package
for real estate scraping*
work with us.*

## Features

@@ -70,8 +67,9 @@ Optional
├── job_type (enum): fulltime, parttime, internship, contract
├── proxy (str): in format 'http://user:pass@host:port' or [https, socks]
├── is_remote (bool)
├── full_description (bool): fetches full description for Indeed / LinkedIn (much slower)
├── results_wanted (int): number of job results to retrieve for each site specified in 'site_type'
├── easy_apply (bool): filters for jobs that are hosted on LinkedIn
├── easy_apply (bool): filters for jobs that are hosted on the job board site
├── country_indeed (enum): filters the country on Indeed (see below for correct spelling)
├── offset (num): starts the search from an offset (e.g. 25 will start the search from the 25th result)
```
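For reference, here is a minimal call that exercises several of the optional parameters listed above. This is an illustrative sketch only: the search values are arbitrary and the commented-out proxy URL is a placeholder, not a working endpoint.

```python
from jobspy import scrape_jobs

jobs = scrape_jobs(
    site_name=["indeed", "linkedin"],
    search_term="data engineer",
    location="Austin, TX",
    job_type="fulltime",   # one of: fulltime, parttime, internship, contract
    is_remote=True,
    results_wanted=20,
    easy_apply=True,       # keep only jobs hosted on the job board itself
    country_indeed="USA",
    offset=0,
    # proxy="http://user:pass@host:port",
)
print(jobs.head())
```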
@@ -82,6 +80,7 @@
JobPost
├── title (str)
├── company (str)
├── company_url (str)
├── job_url (str)
├── location (object)
│   ├── country (str)
@@ -160,16 +159,11 @@ persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues).
**Q: Received a response code 429?**
**A:** This indicates that you have been blocked by the job board site for sending too many requests. All of the job board sites are aggressive with blocking. We recommend:

- Waiting a few seconds between requests.
- Waiting some time between scrapes (site-dependent).
- Trying a VPN or proxy to change your IP address.

---

**Q: Experiencing a "Segmentation fault: 11" on macOS Catalina?**
**A:** This is due to the `tls_client` dependency not supporting your architecture. Solutions and workarounds include:

- Upgrading to a newer version of macOS
- Reaching out to the maintainers of [tls_client](https://github.com/bogdanfinn/tls-client) for fixes
@@ -2,12 +2,11 @@ from jobspy import scrape_jobs
import pandas as pd

jobs: pd.DataFrame = scrape_jobs(
    site_name=["indeed", "linkedin", "zip_recruiter"],
    site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor"],
    search_term="software engineer",
    location="Dallas, TX",
    results_wanted=50,  # be wary: the higher it is, the more likely you'll get blocked (a rotating proxy should work though)
    results_wanted=25,  # be wary: the higher it is, the more likely you'll get blocked (a rotating proxy can help though)
    country_indeed="USA",
    offset=25  # start jobs from an offset (use if a search failed and you want to continue)
    # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
)

@@ -28,4 +27,4 @@ print("outputted to jobs.csv")
# jobs.to_xlsx('jobs.xlsx', index=False)

# 4: display in Jupyter Notebook (1. pip install jupyter 2. jupyter notebook)
# display(jobs)
# display(jobs)
examples/JobSpy_LongScrape.py (new file, 77 lines)
@@ -0,0 +1,77 @@
from jobspy import scrape_jobs
import pandas as pd
import os
import time

# create a new csv filename if jobs.csv already exists
csv_filename = "jobs.csv"
counter = 1
while os.path.exists(csv_filename):
    csv_filename = f"jobs_{counter}.csv"
    counter += 1

# results wanted and offset
results_wanted = 1000
offset = 0

all_jobs = []

# max retries
max_retries = 3

# number of results at each iteration
results_in_each_iteration = 30

while len(all_jobs) < results_wanted:
    retry_count = 0
    while retry_count < max_retries:
        print("Doing from", offset, "to", offset + results_in_each_iteration, "jobs")
        try:
            jobs = scrape_jobs(
                site_name=["indeed"],
                search_term="software engineer",
                # New York, NY
                # Dallas, TX

                # Los Angeles, CA
                location="Los Angeles, CA",
                results_wanted=min(results_in_each_iteration, results_wanted - len(all_jobs)),
                country_indeed="USA",
                offset=offset,
                # proxy="http://jobspy:5a4vpWtj8EeJ2hoYzk@ca.smartproxy.com:20001",
            )

            # add the scraped jobs to the list
            all_jobs.extend(jobs.to_dict('records'))

            # increment the offset for the next page of results
            offset += results_in_each_iteration

            # add a delay to avoid rate limiting (adjust the sleep time as needed)
            print(f"Scraped {len(all_jobs)} jobs")
            print("Sleeping secs", 100 * (retry_count + 1))
            time.sleep(100 * (retry_count + 1))  # sleep between requests to avoid rate limiting

            break  # break out of the retry loop if successful
        except Exception as e:
            print(f"Error: {e}")
            retry_count += 1
            print("Sleeping secs before retry", 100 * (retry_count + 1))
            time.sleep(100 * (retry_count + 1))
            if retry_count >= max_retries:
                print("Max retries reached. Exiting.")
                break

# DataFrame from the collected job data
jobs_df = pd.DataFrame(all_jobs)

# formatting
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", 50)

print(jobs_df)

jobs_df.to_csv(csv_filename, index=False)
print(f"Outputted to {csv_filename}")
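One caveat with the long-scrape pattern above: successive offset windows can return the same posting more than once, so `all_jobs` may contain duplicates. Assuming the returned records keep the `job_url` field shown in the README schema, a small follow-up step can drop them before writing the CSV; a self-contained, illustrative sketch:

```python
import pandas as pd

# illustrative only: two offset windows returned the same posting
all_jobs = [
    {"title": "Software Engineer", "job_url": "https://example.com/job/1"},
    {"title": "Software Engineer", "job_url": "https://example.com/job/1"},
    {"title": "Backend Engineer", "job_url": "https://example.com/job/2"},
]

jobs_df = pd.DataFrame(all_jobs).drop_duplicates(subset="job_url")
print(len(jobs_df))  # 2
```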
poetry.lock (generated, 18 changed lines)
@@ -1053,16 +1053,6 @@ files = [
|
||||
{file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"},
|
||||
{file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"},
|
||||
{file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"},
|
||||
{file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"},
|
||||
{file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"},
|
||||
{file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"},
|
||||
{file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"},
|
||||
{file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"},
|
||||
{file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"},
|
||||
{file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"},
|
||||
{file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"},
|
||||
{file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"},
|
||||
{file = "MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"},
|
||||
{file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"},
|
||||
{file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"},
|
||||
{file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"},
|
||||
@@ -2270,13 +2260,13 @@ test = ["flake8", "isort", "pytest"]
|
||||
|
||||
[[package]]
|
||||
name = "tls-client"
|
||||
version = "0.2.1"
|
||||
version = "1.0"
|
||||
description = "Advanced Python HTTP Client."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "tls_client-0.2.1-py3-none-any.whl", hash = "sha256:124a710952b979d5e20b4e2b7879b7958d6e48a259d0f5b83101055eb173f0bd"},
|
||||
{file = "tls_client-0.2.1.tar.gz", hash = "sha256:473fb4c671d9d4ca6b818548ab6e955640dd589767bfce520830c5618c2f2e2b"},
|
||||
{file = "tls_client-1.0-py3-none-any.whl", hash = "sha256:f1183f5e18cb31914bd62d11b350a33ea0293ea80fb91d69a3072821dece3e66"},
|
||||
{file = "tls_client-1.0.tar.gz", hash = "sha256:7f6de48ad4a0ef69b72682c76ce604155971e07b4bfb2148a36276194ae3e7a0"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2445,4 +2435,4 @@ files = [
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.10"
|
||||
content-hash = "f966f3979873eec2c3b13460067f5aa414c69aa8ab5cd3239c1cfa564fcb5deb"
|
||||
content-hash = "404a77d78066cbb2ef71015562baf44aa11d12aac29a191c1ccc7758bfda598a"
|
||||
|
||||
pyproject.toml

@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.28"
version = "1.1.40"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy"

@@ -13,7 +13,7 @@ packages = [
[tool.poetry.dependencies]
python = "^3.10"
requests = "^2.31.0"
tls-client = "^0.2.1"
tls-client = "*"
beautifulsoup4 = "^4.12.2"
pandas = "^2.1.0"
numpy = "1.24.2"
@@ -40,6 +40,7 @@ def scrape_jobs(
    country_indeed: str = "usa",
    hyperlinks: bool = False,
    proxy: Optional[str] = None,
    full_description: Optional[bool] = False,
    offset: Optional[int] = 0,
) -> pd.DataFrame:
    """

@@ -74,6 +75,7 @@ def scrape_jobs(
        is_remote=is_remote,
        job_type=job_type,
        easy_apply=easy_apply,
        full_description=full_description,
        results_wanted=results_wanted,
        offset=offset,
    )
@@ -1,7 +1,7 @@
from typing import Union, Optional
from typing import Optional
from datetime import date
from enum import Enum
from pydantic import BaseModel, validator
from pydantic import BaseModel


class JobType(Enum):

@@ -55,18 +55,24 @@ class JobType(Enum):


class Country(Enum):
    ARGENTINA = ("argentina", "com.ar")
    """
    Gets the subdomain for Indeed and Glassdoor.
    The second item in the tuple is the subdomain for Indeed
    The third item in the tuple is the subdomain (and tld if there's a ':' separator) for Glassdoor
    """

    ARGENTINA = ("argentina", "ar", "com.ar")
    AUSTRALIA = ("australia", "au", "com.au")
    AUSTRIA = ("austria", "at", "at")
    BAHRAIN = ("bahrain", "bh")
    BELGIUM = ("belgium", "be", "nl:be")
    BELGIUM = ("belgium", "be", "fr:be")
    BRAZIL = ("brazil", "br", "com.br")
    CANADA = ("canada", "ca", "ca")
    CHILE = ("chile", "cl")
    CHINA = ("china", "cn")
    COLOMBIA = ("colombia", "co")
    COSTARICA = ("costa rica", "cr")
    CZECHREPUBLIC = ("czech republic", "cz")
    CZECHREPUBLIC = ("czech republic,czechia", "cz")
    DENMARK = ("denmark", "dk")
    ECUADOR = ("ecuador", "ec")
    EGYPT = ("egypt", "eg")

@@ -112,8 +118,8 @@ class Country(Enum):
    TURKEY = ("turkey", "tr")
    UKRAINE = ("ukraine", "ua")
    UNITEDARABEMIRATES = ("united arab emirates", "ae")
    UK = ("uk", "uk", "co.uk")
    USA = ("usa", "www", "com")
    UK = ("uk,united kingdom", "uk", "co.uk")
    USA = ("usa,us,united states", "www", "com")
    URUGUAY = ("uruguay", "uy")
    VENEZUELA = ("venezuela", "ve")
    VIETNAM = ("vietnam", "vn")

@@ -121,7 +127,7 @@ class Country(Enum):
    # internal for ziprecruiter
    US_CANADA = ("usa/ca", "www")

    # internal for linkeind
    # internal for linkedin
    WORLDWIDE = ("worldwide", "www")

    @property

@@ -147,7 +153,8 @@ class Country(Enum):
        """Convert a string to the corresponding Country enum."""
        country_str = country_str.strip().lower()
        for country in cls:
            if country.value[0] == country_str:
            country_names = country.value[0].split(',')
            if country_str in country_names:
                return country
        valid_countries = [country.value for country in cls]
        raise ValueError(

@@ -167,10 +174,13 @@ class Location(BaseModel):
        if self.state:
            location_parts.append(self.state)
        if self.country and self.country not in (Country.US_CANADA, Country.WORLDWIDE):
            if self.country.value[0] in ("usa", "uk"):
                location_parts.append(self.country.value[0].upper())
            country_name = self.country.value[0]
            if "," in country_name:
                country_name = country_name.split(",")[0]
            if country_name in ("usa", "uk"):
                location_parts.append(country_name.upper())
            else:
                location_parts.append(self.country.value[0].title())
                location_parts.append(country_name.title())
        return ", ".join(location_parts)
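The change above lets a single enum member answer to several spellings by packing comma-separated aliases into the first tuple element, with `from_string` and `display_location` both splitting on the comma. A minimal standalone sketch of that matching logic, re-implemented here for illustration rather than imported from the package:

```python
from enum import Enum


class Country(Enum):
    # first element: comma-separated aliases; remaining elements: site subdomains
    UK = ("uk,united kingdom", "uk", "co.uk")
    USA = ("usa,us,united states", "www", "com")

    @classmethod
    def from_string(cls, country_str: str) -> "Country":
        country_str = country_str.strip().lower()
        for country in cls:
            if country_str in country.value[0].split(","):
                return country
        raise ValueError(f"Invalid country string: '{country_str}'")


print(Country.from_string("United States"))  # Country.USA
print(Country.from_string("uk"))             # Country.UK
```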
@@ -181,6 +191,10 @@ class CompensationInterval(Enum):
    DAILY = "daily"
    HOURLY = "hourly"

    @classmethod
    def get_interval(cls, pay_period):
        return cls[pay_period].value if pay_period in cls.__members__ else None


class Compensation(BaseModel):
    interval: Optional[CompensationInterval] = None

@@ -19,6 +19,7 @@ class ScraperInput(BaseModel):
    is_remote: bool = False
    job_type: Optional[JobType] = None
    easy_apply: bool = None  # linkedin
    full_description: bool = False
    offset: int = 0

    results_wanted: int = 15
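The new `get_interval` classmethod replaces the per-value if/elif mapping used in the Glassdoor scraper further down. A standalone sketch of its lookup semantics follows; only DAILY and HOURLY are visible in this hunk, so the YEARLY, MONTHLY, and WEEKLY values here are assumptions for illustration. Note that the lookup is by member name, so a string such as "ANNUAL" that is not a member name falls through to None.

```python
from enum import Enum


class CompensationInterval(Enum):
    YEARLY = "yearly"    # assumed value, not shown in the hunk above
    MONTHLY = "monthly"  # assumed value
    WEEKLY = "weekly"    # assumed value
    DAILY = "daily"
    HOURLY = "hourly"

    @classmethod
    def get_interval(cls, pay_period):
        # cls[...] indexes members by name, not by value
        return cls[pay_period].value if pay_period in cls.__members__ else None


print(CompensationInterval.get_interval("HOURLY"))  # "hourly"
print(CompensationInterval.get_interval("ANNUAL"))  # None (not a member name)
```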
@@ -4,17 +4,17 @@ jobspy.scrapers.glassdoor
|
||||
|
||||
This module contains routines to scrape Glassdoor.
|
||||
"""
|
||||
import math
|
||||
import time
|
||||
import re
|
||||
import json
|
||||
from datetime import datetime, date
|
||||
from typing import Optional, Tuple, Any
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import Optional
|
||||
from datetime import datetime, timedelta
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from ..utils import count_urgent_words, extract_emails_from_text
|
||||
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
from ..exceptions import GlassdoorException
|
||||
from ..utils import count_urgent_words, extract_emails_from_text, create_session
|
||||
from ..utils import create_session, modify_and_get_description
|
||||
from ...jobs import (
|
||||
JobPost,
|
||||
Compensation,
|
||||
@@ -22,7 +22,6 @@ from ...jobs import (
|
||||
Location,
|
||||
JobResponse,
|
||||
JobType,
|
||||
Country,
|
||||
)
|
||||
|
||||
|
||||
@@ -31,7 +30,7 @@ class GlassdoorScraper(Scraper):
|
||||
"""
|
||||
Initializes GlassdoorScraper with the Glassdoor job search url
|
||||
"""
|
||||
site = Site(Site.ZIP_RECRUITER)
|
||||
site = Site(Site.GLASSDOOR)
|
||||
super().__init__(site, proxy=proxy)
|
||||
|
||||
self.url = None
|
||||
@@ -49,15 +48,12 @@ class GlassdoorScraper(Scraper):
|
||||
) -> (list[JobPost], str | None):
|
||||
"""
|
||||
Scrapes a page of Glassdoor for jobs with scraper_input criteria
|
||||
:param scraper_input:
|
||||
:return: jobs found on page
|
||||
:return: cursor for next page
|
||||
"""
|
||||
try:
|
||||
payload = self.add_payload(
|
||||
scraper_input, location_id, location_type, page_num, cursor
|
||||
)
|
||||
session = create_session(self.proxy, is_tls=False)
|
||||
session = create_session(self.proxy, is_tls=False, has_retry=True)
|
||||
response = session.post(
|
||||
f"{self.url}/graph", headers=self.headers(), timeout=10, data=payload
|
||||
)
|
||||
@@ -74,48 +70,72 @@ class GlassdoorScraper(Scraper):
|
||||
jobs_data = res_json["data"]["jobListings"]["jobListings"]
|
||||
|
||||
jobs = []
|
||||
for i, job in enumerate(jobs_data):
|
||||
job_url = res_json["data"]["jobListings"]["jobListingSeoLinks"][
|
||||
"linkItems"
|
||||
][i]["url"]
|
||||
if job_url in self.seen_urls:
|
||||
continue
|
||||
self.seen_urls.add(job_url)
|
||||
job = job["jobview"]
|
||||
title = job["job"]["jobTitleText"]
|
||||
company_name = job["header"]["employerNameFromSearch"]
|
||||
location_name = job["header"].get("locationName", "")
|
||||
location_type = job["header"].get("locationType", "")
|
||||
is_remote = False
|
||||
location = None
|
||||
|
||||
if location_type == "S":
|
||||
is_remote = True
|
||||
else:
|
||||
location = self.parse_location(location_name)
|
||||
|
||||
compensation = self.parse_compensation(job["header"])
|
||||
|
||||
job = JobPost(
|
||||
title=title,
|
||||
company_name=company_name,
|
||||
job_url=job_url,
|
||||
location=location,
|
||||
compensation=compensation,
|
||||
is_remote=is_remote,
|
||||
)
|
||||
jobs.append(job)
|
||||
with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
|
||||
future_to_job_data = {executor.submit(self.process_job, job): job for job in jobs_data}
|
||||
for future in as_completed(future_to_job_data):
|
||||
job_data = future_to_job_data[future]
|
||||
try:
|
||||
job_post = future.result()
|
||||
if job_post:
|
||||
jobs.append(job_post)
|
||||
except Exception as exc:
|
||||
raise GlassdoorException(f'Glassdoor generated an exception: {exc}')
|
||||
|
||||
return jobs, self.get_cursor_for_page(
|
||||
res_json["data"]["jobListings"]["paginationCursors"], page_num + 1
|
||||
)
|
||||
|
||||
def process_job(self, job_data):
|
||||
"""Processes a single job and fetches its description."""
|
||||
job_id = job_data["jobview"]["job"]["listingId"]
|
||||
job_url = f'{self.url}job-listing/j?jl={job_id}'
|
||||
if job_url in self.seen_urls:
|
||||
return None
|
||||
self.seen_urls.add(job_url)
|
||||
job = job_data["jobview"]
|
||||
title = job["job"]["jobTitleText"]
|
||||
company_name = job["header"]["employerNameFromSearch"]
|
||||
company_id = job_data['jobview']['header']['employer']['id']
|
||||
location_name = job["header"].get("locationName", "")
|
||||
location_type = job["header"].get("locationType", "")
|
||||
age_in_days = job["header"].get("ageInDays")
|
||||
is_remote, location = False, None
|
||||
date_posted = (datetime.now() - timedelta(days=age_in_days)).date() if age_in_days else None
|
||||
|
||||
if location_type == "S":
|
||||
is_remote = True
|
||||
else:
|
||||
location = self.parse_location(location_name)
|
||||
|
||||
compensation = self.parse_compensation(job["header"])
|
||||
|
||||
try:
|
||||
description = self.fetch_job_description(job_id)
|
||||
except Exception as e :
|
||||
description = None
|
||||
|
||||
job_post = JobPost(
|
||||
title=title,
|
||||
company_url=f"{self.url}Overview/W-EI_IE{company_id}.htm" if company_id else None,
|
||||
company_name=company_name,
|
||||
date_posted=date_posted,
|
||||
job_url=job_url,
|
||||
location=location,
|
||||
compensation=compensation,
|
||||
is_remote=is_remote,
|
||||
description=description,
|
||||
emails=extract_emails_from_text(description) if description else None,
|
||||
num_urgent_words=count_urgent_words(description) if description else None,
|
||||
)
|
||||
return job_post
|
||||
|
||||
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
|
||||
"""
|
||||
Scrapes Glassdoor for jobs with scraper_input criteria.
|
||||
:param scraper_input: Information about job search criteria.
|
||||
:return: JobResponse containing a list of jobs.
|
||||
"""
|
||||
scraper_input.results_wanted = min(900, scraper_input.results_wanted)
|
||||
self.country = scraper_input.country
|
||||
self.url = self.country.get_url()
|
||||
|
||||
@@ -149,6 +169,41 @@ class GlassdoorScraper(Scraper):
|
||||
|
||||
return JobResponse(jobs=all_jobs)
|
||||
|
||||
def fetch_job_description(self, job_id):
|
||||
"""Fetches the job description for a single job ID."""
|
||||
url = f"{self.url}/graph"
|
||||
body = [
|
||||
{
|
||||
"operationName": "JobDetailQuery",
|
||||
"variables": {
|
||||
"jl": job_id,
|
||||
"queryString": "q",
|
||||
"pageTypeEnum": "SERP"
|
||||
},
|
||||
"query": """
|
||||
query JobDetailQuery($jl: Long!, $queryString: String, $pageTypeEnum: PageTypeEnum) {
|
||||
jobview: jobView(
|
||||
listingId: $jl
|
||||
contextHolder: {queryString: $queryString, pageTypeEnum: $pageTypeEnum}
|
||||
) {
|
||||
job {
|
||||
description
|
||||
__typename
|
||||
}
|
||||
__typename
|
||||
}
|
||||
}
|
||||
"""
|
||||
}
|
||||
]
|
||||
response = requests.post(url, json=body, headers=GlassdoorScraper.headers())
|
||||
if response.status_code != 200:
|
||||
return None
|
||||
data = response.json()[0]
|
||||
desc = data['data']['jobview']['job']['description']
|
||||
soup = BeautifulSoup(desc, 'html.parser')
|
||||
return modify_and_get_description(soup)
|
||||
|
||||
@staticmethod
|
||||
def parse_compensation(data: dict) -> Optional[Compensation]:
|
||||
pay_period = data.get("payPeriod")
|
||||
@@ -161,15 +216,8 @@ class GlassdoorScraper(Scraper):
|
||||
interval = None
|
||||
if pay_period == "ANNUAL":
|
||||
interval = CompensationInterval.YEARLY
|
||||
elif pay_period == "MONTHLY":
|
||||
interval = CompensationInterval.MONTHLY
|
||||
elif pay_period == "WEEKLY":
|
||||
interval = CompensationInterval.WEEKLY
|
||||
elif pay_period == "DAILY":
|
||||
interval = CompensationInterval.DAILY
|
||||
elif pay_period == "HOURLY":
|
||||
interval = CompensationInterval.HOURLY
|
||||
|
||||
elif pay_period:
|
||||
interval = CompensationInterval.get_interval(pay_period)
|
||||
min_amount = int(adjusted_pay.get("p10") // 1)
|
||||
max_amount = int(adjusted_pay.get("p90") // 1)
|
||||
|
||||
@@ -180,17 +228,11 @@ class GlassdoorScraper(Scraper):
|
||||
currency=currency,
|
||||
)
|
||||
|
||||
def get_job_type_enum(self, job_type_str: str) -> list[JobType] | None:
|
||||
for job_type in JobType:
|
||||
if job_type_str in job_type.value:
|
||||
return [job_type]
|
||||
return None
|
||||
|
||||
def get_location(self, location: str, is_remote: bool) -> (int, str):
|
||||
if not location or is_remote:
|
||||
return "11047", "STATE" # remote options
|
||||
url = f"{self.url}/findPopularLocationAjax.htm?maxLocationsToReturn=10&term={location}"
|
||||
session = create_session(self.proxy)
|
||||
session = create_session(self.proxy, has_retry=True)
|
||||
response = session.get(url)
|
||||
if response.status_code != 200:
|
||||
raise GlassdoorException(
|
||||
@@ -213,12 +255,12 @@ class GlassdoorScraper(Scraper):
|
||||
location_type: str,
|
||||
page_num: int,
|
||||
cursor: str | None = None,
|
||||
) -> dict[str, str | Any]:
|
||||
) -> str:
|
||||
payload = {
|
||||
"operationName": "JobSearchResultsQuery",
|
||||
"variables": {
|
||||
"excludeJobListingIds": [],
|
||||
"filterParams": [],
|
||||
"filterParams": [{"filterKey": "applicationType", "values": "1"}] if scraper_input.easy_apply else [],
|
||||
"keyword": scraper_input.search_term,
|
||||
"numJobsToShow": 30,
|
||||
"locationType": location_type,
|
||||
@@ -243,12 +285,18 @@ class GlassdoorScraper(Scraper):
|
||||
payload["variables"]["filterParams"].append(
|
||||
{"filterKey": "jobType", "values": filter_value}
|
||||
)
|
||||
|
||||
return json.dumps([payload])
|
||||
|
||||
def parse_location(self, location_name: str) -> Location:
|
||||
@staticmethod
|
||||
def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
|
||||
for job_type in JobType:
|
||||
if job_type_str in job_type.value:
|
||||
return [job_type]
|
||||
|
||||
@staticmethod
|
||||
def parse_location(location_name: str) -> Location | None:
|
||||
if not location_name or location_name == "Remote":
|
||||
return None
|
||||
return
|
||||
city, _, state = location_name.partition(", ")
|
||||
return Location(city=city, state=state)
|
||||
|
||||
@@ -257,7 +305,6 @@ class GlassdoorScraper(Scraper):
|
||||
for cursor_data in pagination_cursors:
|
||||
if cursor_data["pageNumber"] == page_num:
|
||||
return cursor_data["cursor"]
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def headers() -> dict:
|
||||
|
||||
@@ -8,6 +8,7 @@ import re
|
||||
import math
|
||||
import io
|
||||
import json
|
||||
from typing import Any
|
||||
from datetime import datetime
|
||||
|
||||
import urllib.parse
|
||||
@@ -21,6 +22,7 @@ from ..utils import (
|
||||
extract_emails_from_text,
|
||||
create_session,
|
||||
get_enum_from_job_type,
|
||||
modify_and_get_description
|
||||
)
|
||||
from ...jobs import (
|
||||
JobPost,
|
||||
@@ -43,7 +45,7 @@ class IndeedScraper(Scraper):
|
||||
site = Site(Site.INDEED)
|
||||
super().__init__(site, proxy=proxy)
|
||||
|
||||
self.jobs_per_page = 15
|
||||
self.jobs_per_page = 25
|
||||
self.seen_urls = set()
|
||||
|
||||
def scrape_page(
|
||||
@@ -59,29 +61,12 @@ class IndeedScraper(Scraper):
|
||||
domain = self.country.indeed_domain_value
|
||||
self.url = f"https://{domain}.indeed.com"
|
||||
|
||||
params = {
|
||||
"q": scraper_input.search_term,
|
||||
"l": scraper_input.location,
|
||||
"filter": 0,
|
||||
"start": scraper_input.offset + page * 10,
|
||||
}
|
||||
if scraper_input.distance:
|
||||
params["radius"] = scraper_input.distance
|
||||
|
||||
sc_values = []
|
||||
if scraper_input.is_remote:
|
||||
sc_values.append("attr(DSQF7)")
|
||||
if scraper_input.job_type:
|
||||
sc_values.append("jt({})".format(scraper_input.job_type.value))
|
||||
|
||||
if sc_values:
|
||||
params["sc"] = "0kf:" + "".join(sc_values) + ";"
|
||||
try:
|
||||
session = create_session(self.proxy, is_tls=True)
|
||||
session = create_session(self.proxy)
|
||||
response = session.get(
|
||||
f"{self.url}/jobs",
|
||||
f"{self.url}/m/jobs",
|
||||
headers=self.get_headers(),
|
||||
params=params,
|
||||
params=self.add_params(scraper_input, page),
|
||||
allow_redirects=True,
|
||||
timeout_seconds=10,
|
||||
)
|
||||
@@ -110,8 +95,8 @@ class IndeedScraper(Scraper):
|
||||
):
|
||||
raise IndeedException("No jobs found.")
|
||||
|
||||
def process_job(job) -> JobPost | None:
|
||||
job_url = f'{self.url}/jobs/viewjob?jk={job["jobkey"]}'
|
||||
def process_job(job: dict) -> JobPost | None:
|
||||
job_url = f'{self.url}/m/jobs/viewjob?jk={job["jobkey"]}'
|
||||
job_url_client = f'{self.url}/viewjob?jk={job["jobkey"]}'
|
||||
if job_url in self.seen_urls:
|
||||
return None
|
||||
@@ -139,7 +124,8 @@ class IndeedScraper(Scraper):
|
||||
date_posted = datetime.fromtimestamp(timestamp_seconds)
|
||||
date_posted = date_posted.strftime("%Y-%m-%d")
|
||||
|
||||
description = self.get_description(job_url)
|
||||
description = self.get_description(job_url) if scraper_input.full_description else None
|
||||
|
||||
with io.StringIO(job["snippet"]) as f:
|
||||
soup_io = BeautifulSoup(f, "html.parser")
|
||||
li_elements = soup_io.find_all("li")
|
||||
@@ -150,6 +136,7 @@ class IndeedScraper(Scraper):
|
||||
title=job["normTitle"],
|
||||
description=description,
|
||||
company_name=job["company"],
|
||||
company_url=self.url + job["companyOverviewLink"] if "companyOverviewLink" in job else None,
|
||||
location=Location(
|
||||
city=job.get("jobLocationCity"),
|
||||
state=job.get("jobLocationState"),
|
||||
@@ -167,8 +154,9 @@ class IndeedScraper(Scraper):
|
||||
)
|
||||
return job_post
|
||||
|
||||
workers = 10 if scraper_input.full_description else 10 # possibly lessen 10 when fetching desc based on feedback
|
||||
jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
|
||||
with ThreadPoolExecutor(max_workers=1) as executor:
|
||||
with ThreadPoolExecutor(max_workers=workers) as executor:
|
||||
job_results: list[Future] = [
|
||||
executor.submit(process_job, job) for job in jobs
|
||||
]
|
||||
@@ -190,7 +178,7 @@ class IndeedScraper(Scraper):
|
||||
#: get first page to initialize session
|
||||
job_list, total_results = self.scrape_page(scraper_input, 0)
|
||||
|
||||
with ThreadPoolExecutor(max_workers=1) as executor:
|
||||
with ThreadPoolExecutor(max_workers=10) as executor:
|
||||
futures: list[Future] = [
|
||||
executor.submit(self.scrape_page, scraper_input, page)
|
||||
for page in range(1, pages_to_process + 1)
|
||||
@@ -219,7 +207,7 @@ class IndeedScraper(Scraper):
|
||||
parsed_url = urllib.parse.urlparse(job_page_url)
|
||||
params = urllib.parse.parse_qs(parsed_url.query)
|
||||
jk_value = params.get("jk", [None])[0]
|
||||
formatted_url = f"{self.url}/viewjob?jk={jk_value}&spa=1"
|
||||
formatted_url = f"{self.url}/m/viewjob?jk={jk_value}&spa=1"
|
||||
session = create_session(self.proxy)
|
||||
|
||||
try:
|
||||
@@ -235,33 +223,24 @@ class IndeedScraper(Scraper):
|
||||
if response.status_code not in range(200, 400):
|
||||
return None
|
||||
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
script_tag = soup.find(
|
||||
"script", text=lambda x: x and "window._initialData" in x
|
||||
)
|
||||
|
||||
if not script_tag:
|
||||
return None
|
||||
|
||||
script_code = script_tag.string
|
||||
match = re.search(r"window\._initialData\s*=\s*({.*?})\s*;", script_code, re.S)
|
||||
|
||||
if not match:
|
||||
return None
|
||||
|
||||
json_string = match.group(1)
|
||||
data = json.loads(json_string)
|
||||
try:
|
||||
job_description = data["jobInfoWrapperModel"]["jobInfoModel"][
|
||||
"sanitizedJobDescription"
|
||||
]
|
||||
soup = BeautifulSoup(response.text, 'html.parser')
|
||||
script_tags = soup.find_all('script')
|
||||
|
||||
job_description = ''
|
||||
for tag in script_tags:
|
||||
if 'window._initialData' in tag.text:
|
||||
json_str = tag.text
|
||||
json_str = json_str.split('window._initialData=')[1]
|
||||
json_str = json_str.rsplit(';', 1)[0]
|
||||
data = json.loads(json_str)
|
||||
job_description = data["jobInfoWrapperModel"]["jobInfoModel"]["sanitizedJobDescription"]
|
||||
break
|
||||
except (KeyError, TypeError, IndexError):
|
||||
return None
|
||||
|
||||
soup = BeautifulSoup(job_description, "html.parser")
|
||||
text_content = " ".join(soup.get_text(separator=" ").split()).strip()
|
||||
|
||||
return text_content
|
||||
return modify_and_get_description(soup)
|
||||
|
||||
@staticmethod
|
||||
def get_job_type(job: dict) -> list[JobType] | None:
|
||||
@@ -320,7 +299,7 @@ class IndeedScraper(Scraper):
|
||||
raise IndeedException("Could not find mosaic provider job cards data")
|
||||
else:
|
||||
raise IndeedException(
|
||||
"Could not find a script tag containing mosaic provider data"
|
||||
"Could not find any results for the search"
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
@@ -344,17 +323,14 @@ class IndeedScraper(Scraper):
|
||||
@staticmethod
|
||||
def get_headers():
|
||||
return {
|
||||
"authority": "www.indeed.com",
|
||||
"accept": "*/*",
|
||||
"accept-language": "en-US,en;q=0.9",
|
||||
"referer": "https://www.indeed.com/viewjob?jk=fe6182337d72c7b1&tk=1hcbfcmd0k62t802&from=serp&vjs=3&advn=8132938064490989&adid=408692607&ad=-6NYlbfkN0A3Osc99MJFDKjquSk4WOGT28ALb_ad4QMtrHreCb9ICg6MiSVy9oDAp3evvOrI7Q-O9qOtQTg1EPbthP9xWtBN2cOuVeHQijxHjHpJC65TjDtftH3AXeINjBvAyDrE8DrRaAXl8LD3Fs1e_xuDHQIssdZ2Mlzcav8m5jHrA0fA64ZaqJV77myldaNlM7-qyQpy4AsJQfvg9iR2MY7qeC5_FnjIgjKIy_lNi9OPMOjGRWXA94CuvC7zC6WeiJmBQCHISl8IOBxf7EdJZlYdtzgae3593TFxbkd6LUwbijAfjax39aAuuCXy3s9C4YgcEP3TwEFGQoTpYu9Pmle-Ae1tHGPgsjxwXkgMm7Cz5mBBdJioglRCj9pssn-1u1blHZM4uL1nK9p1Y6HoFgPUU9xvKQTHjKGdH8d4y4ETyCMoNF4hAIyUaysCKdJKitC8PXoYaWhDqFtSMR4Jys8UPqUV&xkcb=SoDD-_M3JLQfWnQTDh0LbzkdCdPP&xpse=SoBa6_I3JLW9FlWZlB0PbzkdCdPP&sjdu=i6xVERweJM_pVUvgf-MzuaunBTY7G71J5eEX6t4DrDs5EMPQdODrX7Nn-WIPMezoqr5wA_l7Of-3CtoiUawcHw",
|
||||
"sec-ch-ua": '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
|
||||
"sec-ch-ua-mobile": "?0",
|
||||
"sec-ch-ua-platform": '"Windows"',
|
||||
"sec-fetch-dest": "empty",
|
||||
"sec-fetch-mode": "cors",
|
||||
"sec-fetch-site": "same-origin",
|
||||
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
|
||||
'Host': 'www.indeed.com',
|
||||
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
'sec-fetch-site': 'same-origin',
|
||||
'sec-fetch-dest': 'document',
|
||||
'accept-language': 'en-US,en;q=0.9',
|
||||
'sec-fetch-mode': 'navigate',
|
||||
'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Indeed App 192.0',
|
||||
'referer': 'https://www.indeed.com/m/jobs?q=software%20intern&l=Dallas%2C%20TX&from=serpso&rq=1&rsIdx=3',
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
@@ -367,3 +343,29 @@ class IndeedScraper(Scraper):
|
||||
if taxonomy["label"] == "remote" and len(taxonomy["attributes"]) > 0:
|
||||
return True
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def add_params(scraper_input: ScraperInput, page: int) -> dict[str, str | Any]:
|
||||
params = {
|
||||
"q": scraper_input.search_term,
|
||||
"l": scraper_input.location,
|
||||
"filter": 0,
|
||||
"start": scraper_input.offset + page * 10,
|
||||
"sort": "date"
|
||||
}
|
||||
if scraper_input.distance:
|
||||
params["radius"] = scraper_input.distance
|
||||
|
||||
sc_values = []
|
||||
if scraper_input.is_remote:
|
||||
sc_values.append("attr(DSQF7)")
|
||||
if scraper_input.job_type:
|
||||
sc_values.append("jt({})".format(scraper_input.job_type.value))
|
||||
|
||||
if sc_values:
|
||||
params["sc"] = "0kf:" + "".join(sc_values) + ";"
|
||||
|
||||
if scraper_input.easy_apply:
|
||||
params['iafilter'] = 1
|
||||
|
||||
return params
|
||||
|
||||
@@ -4,26 +4,40 @@ jobspy.scrapers.linkedin
|
||||
|
||||
This module contains routines to scrape LinkedIn.
|
||||
"""
|
||||
import time
|
||||
import random
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
|
||||
import requests
|
||||
import time
|
||||
from requests.exceptions import ProxyError
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import Tag
|
||||
from threading import Lock
|
||||
from bs4.element import Tag
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type, currency_parser
|
||||
from ..exceptions import LinkedInException
|
||||
from ...jobs import JobPost, Location, JobResponse, JobType, Country, Compensation
|
||||
from ..utils import create_session
|
||||
from ...jobs import (
|
||||
JobPost,
|
||||
Location,
|
||||
JobResponse,
|
||||
JobType,
|
||||
Country,
|
||||
Compensation
|
||||
)
|
||||
from ..utils import (
|
||||
count_urgent_words,
|
||||
extract_emails_from_text,
|
||||
get_enum_from_job_type,
|
||||
currency_parser,
|
||||
modify_and_get_description
|
||||
)
|
||||
|
||||
|
||||
class LinkedInScraper(Scraper):
|
||||
MAX_RETRIES = 3
|
||||
DELAY = 10
|
||||
DELAY = 3
|
||||
|
||||
def __init__(self, proxy: Optional[str] = None):
|
||||
"""
|
||||
@@ -57,6 +71,7 @@ class LinkedInScraper(Scraper):
|
||||
return mapping.get(job_type_enum, "")
|
||||
|
||||
while len(job_list) < scraper_input.results_wanted and page < 1000:
|
||||
session = create_session(is_tls=False, has_retry=True, delay=5)
|
||||
params = {
|
||||
"keywords": scraper_input.search_term,
|
||||
"location": scraper_input.location,
|
||||
@@ -71,44 +86,30 @@ class LinkedInScraper(Scraper):
|
||||
}
|
||||
|
||||
params = {k: v for k, v in params.items() if v is not None}
|
||||
retries = 0
|
||||
while retries < self.MAX_RETRIES:
|
||||
try:
|
||||
response = requests.get(
|
||||
f"{self.url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
|
||||
params=params,
|
||||
allow_redirects=True,
|
||||
proxies=self.proxy,
|
||||
timeout=10,
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
break
|
||||
except requests.HTTPError as e:
|
||||
if hasattr(e, "response") and e.response is not None:
|
||||
if e.response.status_code in (429, 502):
|
||||
time.sleep(self.DELAY)
|
||||
retries += 1
|
||||
continue
|
||||
else:
|
||||
raise LinkedInException(
|
||||
f"bad response status code: {e.response.status_code}"
|
||||
)
|
||||
else:
|
||||
raise
|
||||
except ProxyError as e:
|
||||
raise LinkedInException("bad proxy")
|
||||
except Exception as e:
|
||||
raise LinkedInException(str(e))
|
||||
else:
|
||||
# Raise an exception if the maximum number of retries is reached
|
||||
raise LinkedInException(
|
||||
"Max retries reached, failed to get a valid response"
|
||||
try:
|
||||
response = session.get(
|
||||
f"{self.url}/jobs-guest/jobs/api/seeMoreJobPostings/search?",
|
||||
params=params,
|
||||
allow_redirects=True,
|
||||
proxies=self.proxy,
|
||||
headers=self.headers(),
|
||||
timeout=10,
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
except requests.HTTPError as e:
|
||||
raise LinkedInException(f"bad response status code: {e.response.status_code}")
|
||||
except ProxyError as e:
|
||||
raise LinkedInException("bad proxy")
|
||||
except Exception as e:
|
||||
raise LinkedInException(str(e))
|
||||
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
job_cards = soup.find_all("div", class_="base-search-card")
|
||||
if len(job_cards) == 0:
|
||||
return JobResponse(jobs=job_list)
|
||||
|
||||
for job_card in soup.find_all("div", class_="base-search-card"):
|
||||
for job_card in job_cards:
|
||||
job_url = None
|
||||
href_tag = job_card.find("a", class_="base-card__full-link")
|
||||
if href_tag and "href" in href_tag.attrs:
|
||||
@@ -123,18 +124,19 @@ class LinkedInScraper(Scraper):
|
||||
|
||||
# Call process_job directly without threading
|
||||
try:
|
||||
job_post = self.process_job(job_card, job_url)
|
||||
job_post = self.process_job(job_card, job_url, scraper_input.full_description)
|
||||
if job_post:
|
||||
job_list.append(job_post)
|
||||
except Exception as e:
|
||||
raise LinkedInException("Exception occurred while processing jobs")
|
||||
|
||||
page += 25
|
||||
time.sleep(random.uniform(LinkedInScraper.DELAY, LinkedInScraper.DELAY + 2))
|
||||
|
||||
job_list = job_list[: scraper_input.results_wanted]
|
||||
return JobResponse(jobs=job_list)
|
||||
|
||||
def process_job(self, job_card: Tag, job_url: str) -> Optional[JobPost]:
|
||||
def process_job(self, job_card: Tag, job_url: str, full_descr: bool) -> Optional[JobPost]:
|
||||
salary_tag = job_card.find('span', class_='job-search-card__salary-info')
|
||||
|
||||
compensation = None
|
||||
@@ -171,7 +173,7 @@ class LinkedInScraper(Scraper):
|
||||
if metadata_card
|
||||
else None
|
||||
)
|
||||
date_posted = None
|
||||
date_posted = description = job_type = None
|
||||
if datetime_tag and "datetime" in datetime_tag.attrs:
|
||||
datetime_str = datetime_tag["datetime"]
|
||||
try:
|
||||
@@ -180,21 +182,20 @@ class LinkedInScraper(Scraper):
|
||||
date_posted = None
|
||||
benefits_tag = job_card.find("span", class_="result-benefits__text")
|
||||
benefits = " ".join(benefits_tag.get_text().split()) if benefits_tag else None
|
||||
|
||||
description, job_type = self.get_job_description(job_url)
|
||||
# description, job_type = None, []
|
||||
if full_descr:
|
||||
description, job_type = self.get_job_description(job_url)
|
||||
|
||||
return JobPost(
|
||||
title=title,
|
||||
description=description,
|
||||
company_name=company,
|
||||
company_url=company_url,
|
||||
location=location,
|
||||
date_posted=date_posted,
|
||||
job_url=job_url,
|
||||
job_type=job_type,
|
||||
compensation=compensation,
|
||||
benefits=benefits,
|
||||
job_type=job_type,
|
||||
description=description,
|
||||
emails=extract_emails_from_text(description) if description else None,
|
||||
num_urgent_words=count_urgent_words(description) if description else None,
|
||||
)
|
||||
@@ -208,12 +209,10 @@ class LinkedInScraper(Scraper):
|
||||
:return: description or None
|
||||
"""
|
||||
try:
|
||||
response = requests.get(job_page_url, timeout=5, proxies=self.proxy)
|
||||
session = create_session(is_tls=False, has_retry=True)
|
||||
response = session.get(job_page_url, timeout=5, proxies=self.proxy)
|
||||
response.raise_for_status()
|
||||
except requests.HTTPError as e:
|
||||
if hasattr(e, "response") and e.response is not None:
|
||||
if e.response.status_code in (429, 502):
|
||||
time.sleep(self.DELAY)
|
||||
return None, None
|
||||
except Exception as e:
|
||||
return None, None
|
||||
@@ -227,7 +226,7 @@ class LinkedInScraper(Scraper):
|
||||
|
||||
description = None
|
||||
if div_content:
|
||||
description = " ".join(div_content.get_text().split()).strip()
|
||||
description = modify_and_get_description(div_content)
|
||||
|
||||
def get_job_type(
|
||||
soup_job_type: BeautifulSoup,
|
||||
@@ -287,3 +286,21 @@ class LinkedInScraper(Scraper):
|
||||
)
|
||||
|
||||
return location
|
||||
|
||||
@staticmethod
|
||||
def headers() -> dict:
|
||||
return {
|
||||
'authority': 'www.linkedin.com',
|
||||
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
||||
'accept-language': 'en-US,en;q=0.9',
|
||||
'cache-control': 'max-age=0',
|
||||
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
|
||||
# 'sec-ch-ua-mobile': '?0',
|
||||
# 'sec-ch-ua-platform': '"macOS"',
|
||||
# 'sec-fetch-dest': 'document',
|
||||
# 'sec-fetch-mode': 'navigate',
|
||||
# 'sec-fetch-site': 'none',
|
||||
# 'sec-fetch-user': '?1',
|
||||
'upgrade-insecure-requests': '1',
|
||||
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
|
||||
}
|
||||
|
||||
@@ -1,11 +1,22 @@
import re
import numpy as np

import requests
import tls_client
import requests
from requests.adapters import HTTPAdapter, Retry

from ..jobs import JobType


def modify_and_get_description(soup):
    for li in soup.find_all('li'):
        li.string = "- " + li.get_text()

    description = soup.get_text(separator='\n').strip()
    description = re.sub(r'\n+', '\n', description)
    return description


def count_urgent_words(description: str) -> int:
    """
    Count the number of urgent words or phrases in a job description.

@@ -27,11 +38,11 @@ def extract_emails_from_text(text: str) -> list[str] | None:
    return email_regex.findall(text)


def create_session(proxy: dict | None = None, is_tls: bool = True):
def create_session(proxy: dict | None = None, is_tls: bool = True, has_retry: bool = False, delay: int = 1) -> requests.Session:
    """
    Creates a tls client session
    Creates a requests session with optional tls, proxy, and retry settings.

    :return: A session object with or without proxies.
    :return: A session object
    """
    if is_tls:
        session = tls_client.Session(

@@ -39,17 +50,21 @@ def create_session(proxy: dict | None = None, is_tls: bool = True):
            random_tls_extension_order=True,
        )
        session.proxies = proxy
        # TODO multiple proxies
        # if self.proxies:
        #     session.proxies = {
        #         "http": random.choice(self.proxies),
        #         "https": random.choice(self.proxies),
        #     }
    else:
        session = requests.Session()
        session.allow_redirects = True
        if proxy:
            session.proxies.update(proxy)
        if has_retry:
            retries = Retry(total=3,
                            connect=3,
                            status=3,
                            status_forcelist=[500, 502, 503, 504, 429],
                            backoff_factor=delay)
            adapter = HTTPAdapter(max_retries=retries)

            session.mount('http://', adapter)
            session.mount('https://', adapter)

    return session
||||
|
||||
|
||||
@@ -15,8 +15,8 @@ from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
from .. import Scraper, ScraperInput, Site
|
||||
from ..exceptions import ZipRecruiterException
|
||||
from ..utils import count_urgent_words, extract_emails_from_text, create_session
|
||||
from ...jobs import JobPost, Compensation, Location, JobResponse, JobType, Country
|
||||
from ..utils import count_urgent_words, extract_emails_from_text, create_session, modify_and_get_description
|
||||
|
||||
|
||||
class ZipRecruiterScraper(Scraper):
|
||||
@@ -26,6 +26,8 @@ class ZipRecruiterScraper(Scraper):
|
||||
"""
|
||||
site = Site(Site.ZIP_RECRUITER)
|
||||
self.url = "https://www.ziprecruiter.com"
|
||||
self.session = create_session(proxy)
|
||||
self.get_cookies()
|
||||
super().__init__(site, proxy=proxy)
|
||||
|
||||
self.jobs_per_page = 20
|
||||
@@ -42,14 +44,12 @@ class ZipRecruiterScraper(Scraper):
|
||||
"""
|
||||
params = self.add_params(scraper_input)
|
||||
if continue_token:
|
||||
params["continue"] = continue_token
|
||||
params["continue_from"] = continue_token
|
||||
try:
|
||||
session = create_session(self.proxy, is_tls=False)
|
||||
response = session.get(
|
||||
response = self.session.get(
|
||||
f"https://api.ziprecruiter.com/jobs-app/jobs",
|
||||
headers=self.headers(),
|
||||
params=self.add_params(scraper_input),
|
||||
timeout=10,
|
||||
params=params
|
||||
)
|
||||
if response.status_code != 200:
|
||||
raise ZipRecruiterException(
|
||||
@@ -68,7 +68,7 @@ class ZipRecruiterScraper(Scraper):
|
||||
with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
|
||||
job_results = [executor.submit(self.process_job, job) for job in jobs_list]
|
||||
|
||||
job_list = [result.result() for result in job_results if result.result()]
|
||||
job_list = list(filter(None, (result.result() for result in job_results)))
|
||||
return job_list, next_continue_token
|
||||
|
||||
def scrape(self, scraper_input: ScraperInput) -> JobResponse:
|
||||
@@ -95,20 +95,19 @@ class ZipRecruiterScraper(Scraper):
|
||||
if not continue_token:
|
||||
break
|
||||
|
||||
if len(job_list) > scraper_input.results_wanted:
|
||||
job_list = job_list[: scraper_input.results_wanted]
|
||||
return JobResponse(jobs=job_list[: scraper_input.results_wanted])
|
||||
|
||||
return JobResponse(jobs=job_list)
|
||||
|
||||
@staticmethod
|
||||
def process_job(job: dict) -> JobPost:
|
||||
def process_job(self, job: dict) -> JobPost | None:
|
||||
"""Processes an individual job dict from the response"""
|
||||
title = job.get("name")
|
||||
job_url = job.get("job_url")
|
||||
job_url = f"https://www.ziprecruiter.com/jobs//j?lvk={job['listing_key']}"
|
||||
if job_url in self.seen_urls:
|
||||
return
|
||||
self.seen_urls.add(job_url)
|
||||
|
||||
description = BeautifulSoup(
|
||||
job.get("job_description", "").strip(), "html.parser"
|
||||
).get_text()
|
||||
job_description_html = job.get("job_description", "").strip()
|
||||
description_soup = BeautifulSoup(job_description_html, "html.parser")
|
||||
description = modify_and_get_description(description_soup)
|
||||
|
||||
company = job["hiring_company"].get("name") if "hiring_company" in job else None
|
||||
country_value = "usa" if job.get("job_country") == "US" else "canada"
|
||||
@@ -156,6 +155,11 @@ class ZipRecruiterScraper(Scraper):
|
||||
num_urgent_words=count_urgent_words(description) if description else None,
|
||||
)
|
||||
|
||||
def get_cookies(self):
|
||||
url="https://api.ziprecruiter.com/jobs-app/event"
|
||||
data="event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
|
||||
self.session.post(url, data=data, headers=ZipRecruiterScraper.headers())
|
||||
|
||||
@staticmethod
|
||||
def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
|
||||
for job_type in JobType:
|
||||
@@ -168,7 +172,6 @@ class ZipRecruiterScraper(Scraper):
|
||||
params = {
|
||||
"search": scraper_input.search_term,
|
||||
"location": scraper_input.location,
|
||||
"form": "jobs-landing",
|
||||
}
|
||||
job_type_value = None
|
||||
if scraper_input.job_type:
|
||||
@@ -178,6 +181,8 @@ class ZipRecruiterScraper(Scraper):
|
||||
job_type_value = "part_time"
|
||||
else:
|
||||
job_type_value = scraper_input.job_type.value
|
||||
if scraper_input.easy_apply:
|
||||
params['zipapply'] = 1
|
||||
|
||||
if job_type_value:
|
||||
params[
|
||||
@@ -200,7 +205,6 @@ class ZipRecruiterScraper(Scraper):
|
||||
"""
|
||||
return {
|
||||
"Host": "api.ziprecruiter.com",
|
||||
"Cookie": "ziprecruiter_browser=018188e0-045b-4ad7-aa50-627a6c3d43aa; ziprecruiter_session=5259b2219bf95b6d2299a1417424bc2edc9f4b38; SplitSV=2016-10-19%3AU2FsdGVkX19f9%2Bx70knxc%2FeR3xXR8lWoTcYfq5QjmLU%3D%0A; __cf_bm=qXim3DtLPbOL83GIp.ddQEOFVFTc1OBGPckiHYxcz3o-1698521532-0-AfUOCkgCZyVbiW1ziUwyefCfzNrJJTTKPYnif1FZGQkT60dMowmSU/Y/lP+WiygkFPW/KbYJmyc+MQSkkad5YygYaARflaRj51abnD+SyF9V; zglobalid=68d49bd5-0326-428e-aba8-8a04b64bc67c.af2d99ff7c03.653d61bb; ziprecruiter_browser=018188e0-045b-4ad7-aa50-627a6c3d43aa; ziprecruiter_session=5259b2219bf95b6d2299a1417424bc2edc9f4b38",
|
||||
"accept": "*/*",
|
||||
"x-zr-zva-override": "100000000;vid:ZT1huzm_EQlDTVEc",
|
||||
"x-pushnotificationid": "0ff4983d38d7fc5b3370297f2bcffcf4b3321c418f5c22dd152a0264707602a0",
|
||||
|
||||