mirror of https://github.com/Bunsly/JobSpy
parent
5b3627b244
commit
b97c73ffd6
|
@ -1,6 +1,6 @@
|
||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "python-jobspy"
|
name = "python-jobspy"
|
||||||
version = "1.1.35"
|
version = "1.1.36"
|
||||||
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
|
||||||
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
authors = ["Zachary Hampton <zachary@bunsly.com>", "Cullen Watson <cullen@bunsly.com>"]
|
||||||
homepage = "https://github.com/Bunsly/JobSpy"
|
homepage = "https://github.com/Bunsly/JobSpy"
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
from typing import Union, Optional
|
from typing import Optional
|
||||||
from datetime import date
|
from datetime import date
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pydantic import BaseModel, validator
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
class JobType(Enum):
|
class JobType(Enum):
|
||||||
|
|
|
@ -14,7 +14,7 @@ from ..utils import count_urgent_words, extract_emails_from_text
|
||||||
|
|
||||||
from .. import Scraper, ScraperInput, Site
|
from .. import Scraper, ScraperInput, Site
|
||||||
from ..exceptions import GlassdoorException
|
from ..exceptions import GlassdoorException
|
||||||
from ..utils import create_session
|
from ..utils import create_session, modify_and_get_description
|
||||||
from ...jobs import (
|
from ...jobs import (
|
||||||
JobPost,
|
JobPost,
|
||||||
Compensation,
|
Compensation,
|
||||||
|
@ -200,9 +200,7 @@ class GlassdoorScraper(Scraper):
|
||||||
data = response.json()[0]
|
data = response.json()[0]
|
||||||
desc = data['data']['jobview']['job']['description']
|
desc = data['data']['jobview']['job']['description']
|
||||||
soup = BeautifulSoup(desc, 'html.parser')
|
soup = BeautifulSoup(desc, 'html.parser')
|
||||||
description = soup.get_text(separator='\n')
|
return modify_and_get_description(soup)
|
||||||
|
|
||||||
return description
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def parse_compensation(data: dict) -> Optional[Compensation]:
|
def parse_compensation(data: dict) -> Optional[Compensation]:
|
||||||
|
@ -292,12 +290,11 @@ class GlassdoorScraper(Scraper):
|
||||||
for job_type in JobType:
|
for job_type in JobType:
|
||||||
if job_type_str in job_type.value:
|
if job_type_str in job_type.value:
|
||||||
return [job_type]
|
return [job_type]
|
||||||
return None
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def parse_location(location_name: str) -> Location:
|
def parse_location(location_name: str) -> Location | None:
|
||||||
if not location_name or location_name == "Remote":
|
if not location_name or location_name == "Remote":
|
||||||
return None
|
return
|
||||||
city, _, state = location_name.partition(", ")
|
city, _, state = location_name.partition(", ")
|
||||||
return Location(city=city, state=state)
|
return Location(city=city, state=state)
|
||||||
|
|
||||||
|
@ -306,7 +303,6 @@ class GlassdoorScraper(Scraper):
|
||||||
for cursor_data in pagination_cursors:
|
for cursor_data in pagination_cursors:
|
||||||
if cursor_data["pageNumber"] == page_num:
|
if cursor_data["pageNumber"] == page_num:
|
||||||
return cursor_data["cursor"]
|
return cursor_data["cursor"]
|
||||||
return None
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def headers() -> dict:
|
def headers() -> dict:
|
||||||
|
|
|
@ -21,6 +21,7 @@ from ..utils import (
|
||||||
extract_emails_from_text,
|
extract_emails_from_text,
|
||||||
create_session,
|
create_session,
|
||||||
get_enum_from_job_type,
|
get_enum_from_job_type,
|
||||||
|
modify_and_get_description
|
||||||
)
|
)
|
||||||
from ...jobs import (
|
from ...jobs import (
|
||||||
JobPost,
|
JobPost,
|
||||||
|
@ -247,9 +248,7 @@ class IndeedScraper(Scraper):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
soup = BeautifulSoup(job_description, "html.parser")
|
soup = BeautifulSoup(job_description, "html.parser")
|
||||||
text_content = "\n".join(soup.stripped_strings)
|
return modify_and_get_description(soup)
|
||||||
|
|
||||||
return text_content
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_job_type(job: dict) -> list[JobType] | None:
|
def get_job_type(job: dict) -> list[JobType] | None:
|
||||||
|
|
|
@ -4,23 +4,36 @@ jobspy.scrapers.linkedin
|
||||||
|
|
||||||
This module contains routines to scrape LinkedIn.
|
This module contains routines to scrape LinkedIn.
|
||||||
"""
|
"""
|
||||||
|
import time
|
||||||
import random
|
import random
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
import time
|
|
||||||
from requests.exceptions import ProxyError
|
from requests.exceptions import ProxyError
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from bs4.element import Tag
|
|
||||||
from threading import Lock
|
from threading import Lock
|
||||||
|
from bs4.element import Tag
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
from urllib.parse import urlparse, urlunparse
|
from urllib.parse import urlparse, urlunparse
|
||||||
|
|
||||||
from .. import Scraper, ScraperInput, Site
|
from .. import Scraper, ScraperInput, Site
|
||||||
from ..exceptions import LinkedInException
|
from ..exceptions import LinkedInException
|
||||||
from ..utils import create_session
|
from ..utils import create_session
|
||||||
from ...jobs import JobPost, Location, JobResponse, JobType, Country, Compensation
|
from ...jobs import (
|
||||||
from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type, currency_parser
|
JobPost,
|
||||||
|
Location,
|
||||||
|
JobResponse,
|
||||||
|
JobType,
|
||||||
|
Country,
|
||||||
|
Compensation
|
||||||
|
)
|
||||||
|
from ..utils import (
|
||||||
|
count_urgent_words,
|
||||||
|
extract_emails_from_text,
|
||||||
|
get_enum_from_job_type,
|
||||||
|
currency_parser,
|
||||||
|
modify_and_get_description
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class LinkedInScraper(Scraper):
|
class LinkedInScraper(Scraper):
|
||||||
|
@ -213,7 +226,7 @@ class LinkedInScraper(Scraper):
|
||||||
|
|
||||||
description = None
|
description = None
|
||||||
if div_content:
|
if div_content:
|
||||||
description = "\n".join(line.strip() for line in div_content.get_text(separator="\n").splitlines() if line.strip())
|
description = modify_and_get_description(div_content)
|
||||||
|
|
||||||
def get_job_type(
|
def get_job_type(
|
||||||
soup_job_type: BeautifulSoup,
|
soup_job_type: BeautifulSoup,
|
||||||
|
|
|
@ -8,6 +8,15 @@ from requests.adapters import HTTPAdapter, Retry
|
||||||
from ..jobs import JobType
|
from ..jobs import JobType
|
||||||
|
|
||||||
|
|
||||||
|
def modify_and_get_description(soup) -> str:
    """
    Flatten parsed job-description HTML into plain text with "- " bullets.

    Each non-empty ``<li>`` element is rewritten in place so that its text
    starts with "- ". The whole tree is then rendered as text with one
    newline between text nodes, every line is stripped, and blank or
    whitespace-only lines are dropped.

    Note: ``soup`` is mutated. Assigning to ``li.string`` replaces all of the
    element's children with a single text node, so markup nested inside a
    list item (including nested lists) is flattened into that item's text.

    :param soup: parsed HTML exposing ``find_all`` and ``get_text``; the
        scrapers pass either a BeautifulSoup document or a single Tag
    :return: the description as newline-separated, stripped, non-empty lines
    """
    for li in soup.find_all("li"):
        # Strip first so the marker stays on the same line as the item text
        # even when the HTML is pretty-printed (<li>\n  Item\n</li>).
        item_text = li.get_text().strip()
        if item_text:  # leave empty items alone rather than emit a bare "-"
            li.string = "- " + item_text

    text = soup.get_text(separator="\n")
    # Strip per line before filtering: collapsing runs of "\n" alone would
    # keep lines that contain only spaces, tabs or "\r".
    stripped_lines = (line.strip() for line in text.splitlines())
    return "\n".join(line for line in stripped_lines if line)
|
||||||
|
|
||||||
|
|
||||||
def count_urgent_words(description: str) -> int:
|
def count_urgent_words(description: str) -> int:
|
||||||
"""
|
"""
|
||||||
Count the number of urgent words or phrases in a job description.
|
Count the number of urgent words or phrases in a job description.
|
||||||
|
|
|
@ -10,14 +10,13 @@ import re
|
||||||
from datetime import datetime, date
|
from datetime import datetime, date
|
||||||
from typing import Optional, Tuple, Any
|
from typing import Optional, Tuple, Any
|
||||||
|
|
||||||
import requests
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
|
||||||
from .. import Scraper, ScraperInput, Site
|
from .. import Scraper, ScraperInput, Site
|
||||||
from ..exceptions import ZipRecruiterException
|
from ..exceptions import ZipRecruiterException
|
||||||
from ..utils import count_urgent_words, extract_emails_from_text, create_session
|
|
||||||
from ...jobs import JobPost, Compensation, Location, JobResponse, JobType, Country
|
from ...jobs import JobPost, Compensation, Location, JobResponse, JobType, Country
|
||||||
|
from ..utils import count_urgent_words, extract_emails_from_text, create_session, modify_and_get_description
|
||||||
|
|
||||||
|
|
||||||
class ZipRecruiterScraper(Scraper):
|
class ZipRecruiterScraper(Scraper):
|
||||||
|
@ -107,9 +106,9 @@ class ZipRecruiterScraper(Scraper):
|
||||||
title = job.get("name")
|
title = job.get("name")
|
||||||
job_url = job.get("job_url")
|
job_url = job.get("job_url")
|
||||||
|
|
||||||
description = BeautifulSoup(
|
job_description_html = job.get("job_description", "").strip()
|
||||||
job.get("job_description", "").strip(), "html.parser"
|
description_soup = BeautifulSoup(job_description_html, "html.parser")
|
||||||
).get_text(separator="\n")
|
description = modify_and_get_description(description_soup)
|
||||||
|
|
||||||
company = job["hiring_company"].get("name") if "hiring_company" in job else None
|
company = job["hiring_company"].get("name") if "hiring_company" in job else None
|
||||||
country_value = "usa" if job.get("job_country") == "US" else "canada"
|
country_value = "usa" if job.get("job_country") == "US" else "canada"
|
||||||
|
|
Loading…
Reference in New Issue