Compare commits

..

3 Commits

Author SHA1 Message Date
Cullen Watson
ce3bd84ee5 fix: indeed parse description bug (#96)
* fix(indeed): full descr

* chore: version
2024-02-02 18:21:55 -06:00
Cullen Watson
1ccf2290fe docs: readme 2024-02-02 17:59:24 -06:00
Cullen Watson
ec2eefc58a docs: readme 2024-02-02 17:58:15 -06:00
3 changed files with 18 additions and 13 deletions

View File

@@ -80,6 +80,7 @@ Optional
JobPost
├── title (str)
├── company (str)
├── company_url (str)
├── job_url (str)
├── location (object)
│ ├── country (str)
@@ -158,16 +159,11 @@ persist, [submit an issue](https://github.com/Bunsly/JobSpy/issues).
**Q: Received a response code 429?**
**A:** This indicates that you have been blocked by the job board site for sending too many requests. All of the job board sites are aggressive with blocking. We recommend:
- Waiting a few seconds between requests.
- Waiting some time between scrapes (site-dependent).
- Trying a VPN or proxy to change your IP address.
---
**Q: Experiencing a "Segmentation fault: 11" on macOS Catalina?**
**A:** This is due to `tls_client` dependency not supporting your architecture. Solutions and workarounds include:
- Upgrade to a newer version of MacOS
- Reach out to the maintainers of [tls_client](https://github.com/bogdanfinn/tls-client) for fixes

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "python-jobspy"
version = "1.1.38"
version = "1.1.39"
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
homepage = "https://github.com/Bunsly/JobSpy"

View File

@@ -154,8 +154,9 @@ class IndeedScraper(Scraper):
)
return job_post
workers = 10 if scraper_input.full_description else 10 # possibly lessen 10 when fetching desc based on feedback
jobs = jobs["metaData"]["mosaicProviderJobCardsModel"]["results"]
with ThreadPoolExecutor(max_workers=1) as executor:
with ThreadPoolExecutor(max_workers=workers) as executor:
job_results: list[Future] = [
executor.submit(process_job, job) for job in jobs
]
@@ -206,7 +207,7 @@ class IndeedScraper(Scraper):
parsed_url = urllib.parse.urlparse(job_page_url)
params = urllib.parse.parse_qs(parsed_url.query)
jk_value = params.get("jk", [None])[0]
formatted_url = f"{self.url}/viewjob?jk={jk_value}&spa=1"
formatted_url = f"{self.url}/m/viewjob?jk={jk_value}&spa=1"
session = create_session(self.proxy)
try:
@@ -223,10 +224,18 @@ class IndeedScraper(Scraper):
return None
try:
data = json.loads(response.text)
job_description = data["body"]["jobInfoWrapperModel"]["jobInfoModel"][
"sanitizedJobDescription"
]
soup = BeautifulSoup(response.text, 'html.parser')
script_tags = soup.find_all('script')
job_description = ''
for tag in script_tags:
if 'window._initialData' in tag.text:
json_str = tag.text
json_str = json_str.split('window._initialData=')[1]
json_str = json_str.rsplit(';', 1)[0]
data = json.loads(json_str)
job_description = data["jobInfoWrapperModel"]["jobInfoModel"]["sanitizedJobDescription"]
break
except (KeyError, TypeError, IndexError):
return None