enh: ziprecruiter full description (#162 )

docs: readme
2026-03-05 03:54:31 -08:00 · 2024-06-09 16:21:01 -05:00 · 2024-05-29 19:32:32 -05:00 · 2024-05-28 16:04:26 -05:00 · 2024-05-28 16:01:29 -05:00 · 2024-05-28 15:39:24 -05:00
7 changed files with 83 additions and 28 deletions
--- a/README.md
+++ b/README.md
@@ -13,9 +13,6 @@ work with us.*
 - Aggregates the job postings in a Pandas DataFrame
 - Proxies support
 [Video Guide for JobSpy](https://www.youtube.com/watch?v=RuP1HrAZnxs&pp=ygUgam9icyBzY3JhcGVyIGJvdCBsaW5rZWRpbiBpbmRlZWQ%3D) -
 Updated for release v1.1.3
 ![jobspy](https://github.com/cullenwatson/JobSpy/assets/78247585/ec7ef355-05f6-4fd3-8161-a817e31c5c57)
 ### Installation
@@ -41,12 +38,12 @@ jobs = scrape_jobs(
    country_indeed='USA',  # only needed for indeed / glassdoor
    # linkedin_fetch_description=True # get full description and direct job url for linkedin (slower)
-    # proxies=["Efb5EA8OIk0BQb:wifi;us;@proxy.soax.com:9000", "localhost"],
+    # proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
 )
 print(f"Found {len(jobs)} jobs")
 print(jobs.head())
-jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False) # to_xlsx
+jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False) # to_excel
 ```
 ### Output
@@ -79,7 +76,7 @@ Optional
 ├── job_type (str): 
 |    fulltime, parttime, internship, contract
 │
-├── proxies (): 
+├── proxies (list): 
 |    in format ['user:pass@host:port', 'localhost']
 |    each job board will round robin through the proxies
 │
@@ -143,13 +140,14 @@ JobPost
 │   ├── state (str)
 ├── description (str)
 ├── job_type (str): fulltime, parttime, internship, contract
 ├── job_function (str)
 ├── compensation (object)
 │   ├── interval (str): yearly, monthly, weekly, daily, hourly
 │   ├── min_amount (int)
 │   ├── max_amount (int)
 │   └── currency (enum)
-└── date_posted (date)
+├── date_posted (date)
-└── emails (str)
+├── emails (str)
 └── is_remote (bool)
 Indeed specific
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.54"
+version = "1.1.56"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
 homepage = "https://github.com/Bunsly/JobSpy"
--- a/src/jobspy/init.py
+++ b/src/jobspy/init.py
@@ -182,6 +182,7 @@ def scrape_jobs(
            "max_amount",
            "currency",
            "is_remote",
            "job_function",
            "emails",
            "description",
            "company_url",
--- a/src/jobspy/jobs/init.py
+++ b/src/jobspy/jobs/init.py
@@ -254,6 +254,9 @@ class JobPost(BaseModel):
    logo_photo_url: str | None = None
    banner_photo_url: str | None = None
    # linkedin only atm
    job_function: str | None = None
 class JobResponse(BaseModel):
    jobs: list[JobPost] = []
--- a/src/jobspy/scrapers/linkedin/init.py
+++ b/src/jobspy/scrapers/linkedin/init.py
@@ -13,14 +13,13 @@ import regex as re
 from typing import Optional
 from datetime import datetime
 from threading import Lock
 from bs4.element import Tag
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, urlunparse, unquote
 from .. import Scraper, ScraperInput, Site
 from ..exceptions import LinkedInException
-from ..utils import create_session
+from ..utils import create_session, remove_attributes
 from ...jobs import (
    JobPost,
    Location,
@@ -71,8 +70,8 @@ class LinkedInScraper(Scraper):
        self.scraper_input = scraper_input
        job_list: list[JobPost] = []
        seen_urls = set()
-        url_lock = Lock()
+        page = scraper_input.offset // 10 * 10 if scraper_input.offset else 0
-        page = scraper_input.offset // 25 + 25 if scraper_input.offset else 0
+        request_count = 0
        seconds_old = (
            scraper_input.hours_old * 3600 if scraper_input.hours_old else None
        )
@@ -80,7 +79,8 @@ class LinkedInScraper(Scraper):
            lambda: len(job_list) < scraper_input.results_wanted and page < 1000
        )
        while continue_search():
-            logger.info(f"LinkedIn search page: {page // 25 + 1}")
+            request_count += 1
            logger.info(f"LinkedIn search page: {request_count}")
            params = {
                "keywords": scraper_input.search_term,
                "location": scraper_input.location,
@@ -92,7 +92,7 @@ class LinkedInScraper(Scraper):
                    else None
                ),
                "pageNum": 0,
-                "start": page + scraper_input.offset,
+                "start": page,
                "f_AL": "true" if scraper_input.easy_apply else None,
                "f_C": (
                    ",".join(map(str, scraper_input.linkedin_company_ids))
@@ -140,10 +140,9 @@ class LinkedInScraper(Scraper):
                    job_id = href.split("-")[-1]
                    job_url = f"{self.base_url}/jobs/view/{job_id}"
-                with url_lock:
+                if job_url in seen_urls:
-                    if job_url in seen_urls:
+                    continue
-                        continue
+                seen_urls.add(job_url)
                    seen_urls.add(job_url)
                try:
                    fetch_desc = scraper_input.linkedin_fetch_description
                    job_post = self._process_job(job_card, job_url, fetch_desc)
@@ -156,7 +155,7 @@ class LinkedInScraper(Scraper):
            if continue_search():
                time.sleep(random.uniform(self.delay, self.delay + self.band_delay))
-                page += self.jobs_per_page
+                page += len(job_list)
        job_list = job_list[: scraper_input.results_wanted]
        return JobResponse(jobs=job_list)
@@ -225,6 +224,7 @@ class LinkedInScraper(Scraper):
            job_url_direct=job_details.get("job_url_direct"),
            emails=extract_emails_from_text(job_details.get("description")),
            logo_photo_url=job_details.get("logo_photo_url"),
            job_function=job_details.get("job_function"),
        )
    def _get_id(self, url: str):
@@ -248,7 +248,7 @@ class LinkedInScraper(Scraper):
            response.raise_for_status()
        except:
            return {}
-        if response.url == "https://www.linkedin.com/signup":
+        if "linkedin.com/signup" in response.url:
            return {}
        soup = BeautifulSoup(response.text, "html.parser")
@@ -257,16 +257,22 @@ class LinkedInScraper(Scraper):
        )
        description = None
        if div_content is not None:
            def remove_attributes(tag):
                for attr in list(tag.attrs):
                    del tag[attr]
                return tag
            div_content = remove_attributes(div_content)
            description = div_content.prettify(formatter="html")
            if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
                description = markdown_converter(description)
        h3_tag = soup.find(
            "h3", text=lambda text: text and "Job function" in text.strip()
        )
        job_function = None
        if h3_tag:
            job_function_span = h3_tag.find_next(
                "span", class_="description__job-criteria-text"
            )
            if job_function_span:
                job_function = job_function_span.text.strip()
        return {
            "description": description,
            "job_type": self._parse_job_type(soup),
@@ -274,6 +280,7 @@ class LinkedInScraper(Scraper):
            "logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
                "data-delayed-url"
            ),
            "job_function": job_function,
        }
    def _get_location(self, metadata_card: Optional[Tag]) -> Location:
--- a/src/jobspy/scrapers/utils.py
+++ b/src/jobspy/scrapers/utils.py
@@ -93,6 +93,7 @@ class TLSRotating(RotatingProxySession, tls_client.Session):
            else:
                self.proxies = {}
        response = tls_client.Session.execute_request(self, *args, **kwargs)
        response.ok = response.status_code in range(200, 400)
        return response
@@ -178,3 +179,9 @@ def currency_parser(cur_str):
        num = float(cur_str)
    return np.round(num, 2)
 def remove_attributes(tag):
    """
    Delete every attribute set on ``tag`` and hand the same tag back.

    Only ``tag.attrs`` itself is walked; nested elements are not visited.

    :param tag: object exposing ``attrs`` and supporting ``del tag[name]``
    :return: the same ``tag`` object, modified in place
    """
    # Snapshot the attribute names first so deleting does not disturb iteration.
    attribute_names = tuple(tag.attrs)
    for name in attribute_names:
        del tag[name]
    return tag
--- a/src/jobspy/scrapers/ziprecruiter/init.py
+++ b/src/jobspy/scrapers/ziprecruiter/init.py
@@ -7,19 +7,24 @@ This module contains routines to scrape ZipRecruiter.
 from __future__ import annotations
 import json
 import math
 import re
 import time
 from datetime import datetime
 from typing import Optional, Tuple, Any
 from concurrent.futures import ThreadPoolExecutor
 from bs4 import BeautifulSoup
 from .. import Scraper, ScraperInput, Site
 from ..utils import (
    logger,
    extract_emails_from_text,
    create_session,
    markdown_converter,
    remove_attributes,
 )
 from ...jobs import (
    JobPost,
@@ -151,6 +156,8 @@ class ZipRecruiterScraper(Scraper):
        comp_min = int(job["compensation_min"]) if "compensation_min" in job else None
        comp_max = int(job["compensation_max"]) if "compensation_max" in job else None
        comp_currency = job.get("compensation_currency")
        description_full, job_url_direct = self._get_descr(job_url)
        return JobPost(
            id=str(job["listing_key"]),
            title=title,
@@ -165,10 +172,42 @@ class ZipRecruiterScraper(Scraper):
            ),
            date_posted=date_posted,
            job_url=job_url,
-            description=description,
+            description=description_full if description_full else description,
            emails=extract_emails_from_text(description) if description else None,
            job_url_direct=job_url_direct,
        )
    def _get_descr(self, job_url):
        """
        Fetches the job page and extracts the full description and direct URL.

        :param job_url: URL of the job page to request
        :return: tuple ``(description_full, job_url_direct)``. Both are None
            when the response is not ok. ``description_full`` is the cleaned
            job description followed by the company description (an empty
            string when neither element is found), converted to markdown when
            the scraper input requests it. ``job_url_direct`` is None whenever
            it cannot be extracted from the page's embedded JSON.
        """
        res = self.session.get(job_url, headers=self.headers, allow_redirects=True)
        description_full = job_url_direct = None
        if not res.ok:
            return description_full, job_url_direct

        soup = BeautifulSoup(res.text, "html.parser")
        job_descr_div = soup.find("div", class_="job_description")
        company_descr_section = soup.find("section", class_="company_description")
        job_description_clean = (
            remove_attributes(job_descr_div).prettify(formatter="html")
            if job_descr_div
            else ""
        )
        company_description_clean = (
            remove_attributes(company_descr_section).prettify(formatter="html")
            if company_descr_section
            else ""
        )
        description_full = job_description_clean + company_description_clean

        # The direct URL is a best-effort extra: a missing, empty or malformed
        # JSON payload must not discard the description scraped above.
        script_tag = soup.find("script", type="application/json")
        if script_tag and script_tag.string:
            try:
                job_url_val = json.loads(script_tag.string)["model"]["saveJobURL"]
            except (ValueError, KeyError, TypeError):
                # ValueError covers json.JSONDecodeError; KeyError/TypeError
                # cover a payload that lacks the expected structure.
                job_url_val = None
            if isinstance(job_url_val, str):
                m = re.search(r"job_url=(.+)", job_url_val)
                if m:
                    job_url_direct = m.group(1)

        if self.scraper_input.description_format == DescriptionFormat.MARKDOWN:
            description_full = markdown_converter(description_full)
        return description_full, job_url_direct
    def _get_cookies(self):
        data = "event_type=session&logged_in=false&number_of_retry=1&property=model%3AiPhone&property=os%3AiOS&property=locale%3Aen_us&property=app_build_number%3A4734&property=app_version%3A91.0&property=manufacturer%3AApple&property=timestamp%3A2024-01-12T12%3A04%3A42-06%3A00&property=screen_height%3A852&property=os_version%3A16.6.1&property=source%3Ainstall&property=screen_width%3A393&property=device_model%3AiPhone%2014%20Pro&property=brand%3AApple"
        url = f"{self.api_url}/jobs-app/event"
Author	SHA1	Message	Date
Cullen Watson	ccb0c17660	enh: ziprecruiter full description (#162 )	2024-06-09 16:21:01 -05:00
Cullen Watson	df339610fa	docs: readme	2024-05-29 19:32:32 -05:00
Cullen Watson	c501006bd8	docs: readme	2024-05-28 16:04:26 -05:00
Cullen Watson	89a3ee231c	enh(li): job function (#160 )	2024-05-28 16:01:29 -05:00
Cullen	6439f71433	chore: version	2024-05-28 15:39:24 -05:00
adamagassi	7f6271b2e0	LinkedIn scraper fixes (#159): correct initial page offset calculation; separate page variable from request counter; fix job offset starting value; increment offset by number of jobs returned instead of expected value	2024-05-28 15:38:13 -05:00