From d63d102dbc30ec81d08204c85956a8be2ac1af19 Mon Sep 17 00:00:00 2001 From: yiwen Date: Mon, 2 Oct 2023 19:32:00 +0800 Subject: [PATCH] mongo connection setting --- examples/JobSpy_Demo.py | 48 +- src/__init__.py | 0 src/dao/mongoDAO.py | 29 + src/jobspy/scrapers/indeed/__init__.py | 1 + src/location_seed.json | 961 +++++++++++++++++++++++++ src/scrape.py | 39 + src/title_seed.json | 79 ++ src/tools/tools.py | 16 + 8 files changed, 1149 insertions(+), 24 deletions(-) delete mode 100644 src/__init__.py create mode 100644 src/dao/mongoDAO.py create mode 100644 src/location_seed.json create mode 100644 src/scrape.py create mode 100644 src/title_seed.json create mode 100644 src/tools/tools.py diff --git a/examples/JobSpy_Demo.py b/examples/JobSpy_Demo.py index 17755d9..1ccbcac 100644 --- a/examples/JobSpy_Demo.py +++ b/examples/JobSpy_Demo.py @@ -19,27 +19,27 @@ pd.set_option('display.width', None) pd.set_option('display.max_colwidth', 50) # set to 0 to see full job url / desc # fetch jobs for each location -locations = read_location_list('location_seed.json') -for location in locations: - try: - jobs: pd.DataFrame = scrape_jobs( - # site_name=["indeed", "linkedin", "zip_recruiter"], - site_name=["indeed"], - search_term="software engineer", - location=location, - results_wanted=30, - # be wary the higher it is, the more likey you'll get blocked (rotating proxy should work tho) - country_indeed='USA', - # offset=25 # start jobs from an offset (use if search failed and want to continue) - proxy="http://34.120.172.140:8123", - # proxy="http://crawler-gost-proxy.jobright-internal.com:8080", - ) - except Exception as e: - print(f'Error when process: {location}') - print(e) - continue - print(f'{location}: {jobs.shape[0]} rows append.') - if os.path.isfile('./jobs.csv'): - jobs.to_csv('./jobs.csv', index=False, mode='a', header=False) - else: - jobs.to_csv('./jobs.csv', index=False, mode='a', header=True) +# locations = read_location_list('location_seed.json') +# for 
class MongoDBHelper:
    """Small convenience wrapper around one database of a pymongo client.

    The helper opens a ``MongoClient`` on construction and exposes a few
    collection-level shortcuts (``insert_one``, ``insert_all``, ``find``).
    It can be used as a context manager so the client is always closed::

        with MongoDBHelper() as mongo:
            mongo.insert_one('jobs', {'title': 'Software Engineer'})
    """

    # NOTE(review): the default URI below embeds a username and password that
    # are now part of the repository history. It is kept byte-for-byte so that
    # existing ``MongoDBHelper()`` callers keep working, but the credential
    # should be rotated and supplied from configuration instead.
    def __init__(self,
                 connection_host="mongodb://mongos.mongos:73hck*euuDyU!JXikCTV@172.31.57.134:27017",
                 database_name='indeed'):
        """Connect to ``connection_host`` and select ``database_name``.

        Args:
            connection_host: MongoDB connection URI passed to ``MongoClient``.
            database_name: Name of the database all helper methods operate on.
        """
        # Imported here so the class carries its own dependency.
        from pymongo import MongoClient

        self.client = MongoClient(connection_host)
        self.database = self.client[database_name]

    def __enter__(self):
        """Return the helper itself so it can be bound by ``with ... as``."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the client on leaving the ``with`` block; never suppress errors."""
        self.disconnect()
        return False

    def _collection(self, collection_name):
        """Return the named collection, or raise if the helper was disconnected."""
        # Compare with ``is None``: ``disconnect()`` sets the attribute to None,
        # and an identity check avoids truth-testing the database object.
        if self.database is None:
            raise RuntimeError('MongoDBHelper is disconnected; create a new helper.')
        return self.database[collection_name]

    def disconnect(self):
        """Close the client and drop the references. Safe to call repeatedly."""
        if self.client is not None:
            self.client.close()
            self.client = None
            self.database = None

    def insert_one(self, collection_name, data):
        """Insert one document into ``collection_name`` and return its id."""
        result = self._collection(collection_name).insert_one(data)
        return result.inserted_id

    def insert_all(self, collection_name, data_list):
        """Insert every document of ``data_list`` and return the inserted ids.

        An empty ``data_list`` returns ``[]`` without calling the driver,
        because ``insert_many`` rejects an empty batch and a scrape that finds
        nothing should not be an error.
        """
        collection = self._collection(collection_name)
        documents = list(data_list)
        if not documents:
            return []
        result = collection.insert_many(documents)
        return result.inserted_ids

    def find(self, collection_name, query):
        """Return the result of ``collection.find(query)`` for ``collection_name``."""
        return self._collection(collection_name).find(query)
a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py index 4c2a677..357a5ad 100644 --- a/src/jobspy/scrapers/indeed/__init__.py +++ b/src/jobspy/scrapers/indeed/__init__.py @@ -70,6 +70,7 @@ class IndeedScraper(Scraper): "l": scraper_input.location, "filter": 0, "start": scraper_input.offset + page * 10, + 'fromage': 1 # only need jobs posted in recent 24h } if scraper_input.distance: params["radius"] = scraper_input.distance diff --git a/src/location_seed.json b/src/location_seed.json new file mode 100644 index 0000000..92b62c6 --- /dev/null +++ b/src/location_seed.json @@ -0,0 +1,961 @@ +[ + "new york,NY", + "houston,TX", + "chicago,IL", + "brooklyn,NY", + "los angeles,CA", + "miami,FL", + "san antonio,TX", + "philadelphia,PA", + "las vegas,NV", + "bronx,NY", + "phoenix,AZ", + "dallas,TX", + "san diego,CA", + "minneapolis,MN", + "san jose,CA", + "denver,CO", + "austin,TX", + "st. louis,MO", + "indianapolis,IN", + "atlanta,GA", + "tucson,AZ", + "orlando,FL", + "portland,OR", + "seattle,WA", + "san francisco,CA", + "fort worth,TX", + "jacksonville,FL", + "milwaukee,WI", + "cincinnati,OH", + "charlotte,NC", + "columbus,OH", + "cleveland,OH", + "fort lauderdale,FL", + "sacramento,CA", + "saint paul,MN", + "el paso,TX", + "louisville,KY", + "tampa,FL", + "memphis,TN", + "pittsburgh,PA", + "detroit,MI", + "albuquerque,NM", + "oklahoma city,OK", + "washington,DC", + "fresno,CA", + "buffalo,NY", + "kansas city,MO", + "colorado springs,CO", + "bakersfield,CA", + "omaha,NE", + "birmingham,AL", + "raleigh,NC", + "dayton,OH", + "mesa,AZ", + "rochester,NY", + "long beach,CA", + "staten island,NY", + "salt lake city,UT", + "virginia beach,VA", + "nashville,TN", + "pompano beach,FL", + "hollywood,FL", + "riverside,CA", + "tulsa,OK", + "wichita,KS", + "honolulu,HI", + "knoxville,TN", + "aurora,CO", + "new orleans,LA", + "oakland,CA", + "baton rouge,LA", + "arlington,TX", + "richmond,VA", + "stockton,CA", + "anaheim,CA", + "grand rapids,MI", + "santa 
ana,CA", + "hialeah,FL", + "spokane,WA", + "saint petersburg,FL", + "west palm beach,FL", + "tacoma,WA", + "toledo,OH", + "spring,TX", + "corpus christi,TX", + "fort wayne,IN", + "greensboro,NC", + "littleton,CO", + "marietta,GA", + "vancouver,WA", + "lexington,KY", + "silver spring,MD", + "glendale,AZ", + "plano,TX", + "newark,NJ", + "reno,NV", + "naples,FL", + "tallahassee,FL", + "madison,WI", + "akron,OH", + "mobile,AL", + "scottsdale,AZ", + "lincoln,NE", + "henderson,NV", + "durham,NC", + "chandler,AZ", + "modesto,CA", + "pensacola,FL", + "lubbock,TX", + "katy,TX", + "winston-salem,NC", + "salem,OR", + "laredo,TX", + "jersey city,NJ", + "norfolk,VA", + "anchorage,AK", + "chula vista,CA", + "fayetteville,NC", + "lakeland,FL", + "san bernardino,CA", + "boise,ID", + "corona,CA", + "sarasota,FL", + "paradise,NV", + "north hempstead,NY", + "syracuse,NY", + "savannah,GA", + "lawrenceville,GA", + "kissimmee,FL", + "shreveport,LA", + "garland,TX", + "trenton,NJ", + "fort myers,FL", + "chesapeake,VA", + "fontana,CA", + "amarillo,TX", + "springfield,MO", + "irving,TX", + "bradenton,FL", + "lake worth,FL", + "wilmington,DE", + "boston,MA", + "montgomery,AL", + "north las vegas,NV", + "little rock,AR", + "fremont,CA", + "irvine,CA", + "des moines,IA", + "oxnard,CA", + "gilbert,AZ", + "boca raton,FL", + "ocala,FL", + "aurora,IL", + "providence,RI", + "augusta,GA", + "chattanooga,TN", + "gainesville,FL", + "brownsville,TX", + "spring valley,NV", + "moreno valley,CA", + "sunrise manor,NV", + "eugene,OR", + "huntington beach,CA", + "huntsville,AL", + "whittier,CA", + "port st. 
lucie,FL", + "rockford,IL", + "oceanside,CA", + "erie,PA", + "ogden,UT", + "wilmington,NC", + "woodbridge,VA", + "beaverton,OR", + "evansville,IN", + "lancaster,CA", + "salinas,CA", + "worcester,MA", + "newport news,VA", + "green bay,WI", + "yonkers,NY", + "hayward,CA", + "fort collins,CO", + "york,PA", + "jackson,MS", + "grand prairie,TX", + "torrance,CA", + "columbus,GA", + "glendale,CA", + "garden grove,CA", + "clearwater,FL", + "escondido,CA", + "palmdale,CA", + "overland park,KS", + "decatur,GA", + "new haven,CT", + "tempe,AZ", + "mission,TX", + "olympia,WA", + "ontario,CA", + "allentown,PA", + "macon,GA", + "roanoke,VA", + "alpharetta,GA", + "harrisburg,PA", + "rancho cucamonga,CA", + "el cajon,CA", + "topeka,KS", + "youngstown,OH", + "flint,MI", + "elk grove,CA", + "schenectady,NY", + "lansing,MI", + "sioux falls,SD", + "naperville,IL", + "kalamazoo,MI", + "murfreesboro,TN", + "ann arbor,MI", + "peoria,AZ", + "cape coral,FL", + "fredericksburg,VA", + "everett,WA", + "boynton beach,FL", + "homestead,FL", + "lakewood,CO", + "pomona,CA", + "north township,IN", + "pasadena,CA", + "fairfax,VA", + "pueblo,CO", + "clarksville,TN", + "lafayette,LA", + "albany,NY", + "hyattsville,MD", + "kansas city,KS", + "sugar land,TX", + "waco,TX", + "kent,WA", + "canton,OH", + "paterson,NJ", + "pasadena,TX", + "south bend,IN", + "mckinney,TX", + "bridgeport,CT", + "tyler,TX", + "springfield,MA", + "las cruces,NM", + "beaumont,TX", + "visalia,CA", + "springfield,IL", + "yuma,AZ", + "edmond,OK", + "orange,CA", + "renton,WA", + "mesquite,TX", + "sunnyvale,CA", + "miramar,FL", + "melbourne,FL", + "denton,TX", + "round rock,TX", + "columbia,MO", + "newark,DE", + "manassas,VA", + "odessa,TX", + "metairie,LA", + "ramapo,NY", + "peoria,IL", + "saginaw,MI", + "gaithersburg,MD", + "inglewood,CA", + "compton,CA", + "fullerton,CA", + "saint charles,MO", + "hamilton,OH", + "midland,TX", + "cedar rapids,IA", + "killeen,TX", + "warren,MI", + "santa maria,CA", + "santa barbara,CA", + "san 
mateo,CA", + "mcallen,TX", + "cary,NC", + "bellevue,WA", + "humble,TX", + "rockville,MD", + "victorville,CA", + "conroe,TX", + "olathe,KS", + "west valley city,UT", + "sterling heights,MI", + "cumming,GA", + "puyallup,WA", + "muskegon,MI", + "joliet,IL", + "billings,MT", + "racine,WI", + "bloomington,IN", + "simi valley,CA", + "yakima,WA", + "lake charles,LA", + "abilene,TX", + "hartford,CT", + "concord,CA", + "independence,MO", + "bethlehem,PA", + "roseville,CA", + "cypress,TX", + "stamford,CT", + "toms river,NJ", + "arvada,CO", + "boulder,CO", + "gainesville,GA", + "asheville,NC", + "frederick,MD", + "frisco,TX", + "surprise,AZ", + "carrollton,TX", + "myrtle beach,SC", + "vallejo,CA", + "berkeley,CA", + "panama city,FL", + "rochester,MN", + "hemet,CA", + "bellingham,WA", + "broken arrow,OK", + "longmont,CO", + "elgin,IL", + "duluth,MN", + "appleton,WI", + "falls church,VA", + "high point,NC", + "murrieta,CA", + "santa clara,CA", + "la puente,CA", + "spring hill,FL", + "new port richey,FL", + "temecula,CA", + "norman,OK", + "west jordan,UT", + "provo,UT", + "kenosha,WI", + "tuscaloosa,AL", + "ventura,CA", + "pearland,TX", + "charlottesville,VA", + "stone mountain,GA", + "vista,CA", + "downey,CA", + "redding,CA", + "costa mesa,CA", + "waterbury,CT", + "centennial,CO", + "sparks,NV", + "st. 
augustine,FL", + "clovis,CA", + "miami beach,FL", + "lewisville,TX", + "florissant,MO", + "greenville,NC", + "west covina,CA", + "fargo,ND", + "daly city,CA", + "rock hill,SC", + "delray beach,FL", + "spartanburg,SC", + "bothell,WA", + "burbank,CA", + "sandy springs,GA", + "lowell,MA", + "tracy,CA", + "perris,CA", + "chico,CA", + "jurupa valley,CA", + "fairfield,CA", + "vero beach,FL", + "sandy,UT", + "norwalk,CA", + "bend,OR", + "cambridge,MA", + "douglasville,GA", + "carlsbad,CA", + "davie,FL", + "college station,TX", + "palm bay,FL", + "san leandro,CA", + "concord,NC", + "nampa,ID", + "redwood city,CA", + "springfield,VA", + "bowling green,KY", + "jackson,MI", + "duluth,GA", + "san angelo,TX", + "largo,FL", + "auburn,WA", + "baytown,TX", + "grand junction,CO", + "antioch,CA", + "gastonia,NC", + "davenport,IA", + "longview,TX", + "laurel,MD", + "wichita falls,TX", + "springfield,OH", + "idaho falls,ID", + "albany,GA", + "rialto,CA", + "greeley,CO", + "bell gardens,CA", + "scranton,PA", + "edison,NJ", + "englewood,CO", + "hesperia,CA", + "vacaville,CA", + "daytona beach,FL", + "richardson,TX", + "ypsilanti,MI", + "lynchburg,VA", + "madera,CA", + "waukegan,IL", + "arden-arcade,CA", + "elizabeth,NJ", + "dearborn,MI", + "lafayette,IN", + "santa cruz,CA", + "spokane valley,WA", + "merced,CA", + "richmond,TX", + "elkhart,IN", + "rochester,MI", + "livonia,MI", + "lee's summit,MO", + "charleston,WV", + "winter haven,FL", + "belleville,IL", + "roswell,GA", + "terre haute,IN", + "bowie,MD", + "hagerstown,MD", + "columbia,MD", + "keller,TX", + "waukesha,WI", + "new bedford,MA", + "mission viejo,CA", + "plainfield,IL", + "south gate,CA", + "sunrise,FL", + "lawrence,KS", + "napa,CA", + "muncie,IN", + "brockton,MA", + "valdosta,GA", + "anderson,SC", + "lawton,OK", + "allen,TX", + "hawthorne,CA", + "mansfield,OH", + "hillsboro,OR", + "lynnwood,WA", + "lakewood township,NJ", + "kennesaw,GA", + "ballwin,MO", + "palo alto,CA", + "broomfield,CO", + "quincy,MA", + "battle creek,MI", 
+ "annapolis,MD", + "johnson city,TN", + "santa monica,CA", + "jacksonville,NC", + "acworth,GA", + "missouri city,TX", + "federal way,WA", + "chapel hill,NC", + "greenburgh,NY", + "missoula,MT", + "el monte,CA", + "mcdonough,GA", + "franklin,TN", + "rapid city,SD", + "lynn,MA", + "lithonia,GA", + "canton,MI", + "kennewick,WA", + "fort smith,AR", + "des plaines,IL", + "westminster,CA", + "holland,MI", + "jupiter,FL", + "o'fallon,MO", + "fayetteville,AR", + "bryan,TX", + "san marcos,CA", + "woodstock,GA", + "fort pierce,FL", + "rio rancho,NM", + "cleveland,TN", + "champaign,IL", + "deltona,FL", + "orem,UT", + "midlothian,VA", + "fall river,MA", + "apopka,FL", + "farmington,MI", + "decatur,IL", + "morgantown,WV", + "indio,CA", + "harlingen,TX", + "kingsport,TN", + "herndon,VA", + "saint joseph,MO", + "lima,OH", + "poughkeepsie,NY", + "monroe,LA", + "medford,OR", + "nashua,NH", + "jackson,TN", + "clarkstown,NY", + "cheyenne,WY", + "covington,GA", + "norristown,PA", + "westerville,OH", + "sioux city,IA", + "bloomington,MN", + "upper marlboro,MD", + "springdale,AR", + "riverview,FL", + "norwalk,CT", + "williamsburg,VA", + "gary,IN", + "buford,GA", + "slidell,LA", + "hoover,AL", + "lehigh acres,FL", + "parker,CO", + "bremerton,WA", + "newnan,GA", + "pittsburg,CA", + "cicero,IL", + "suffolk,VA", + "flagstaff,AZ", + "johns creek,GA", + "bloomington,IL", + "westland,MI", + "meridian,ID", + "gardena,CA", + "citrus heights,CA", + "sumter,SC", + "livermore,CA", + "chino,CA", + "easton,PA", + "conyers,GA", + "clifton,NJ", + "gulfport,MS", + "houma,LA", + "milton,FL", + "carson,CA", + "alhambra,CA", + "glen burnie,MD", + "germantown,MD", + "loveland,CO", + "hattiesburg,MS", + "walnut creek,CA", + "watsonville,CA", + "ashburn,VA", + "binghamton,NY", + "clermont,FL", + "maryville,TN", + "owensboro,KY", + "jonesboro,GA", + "warner robins,GA", + "eau claire,WI", + "farmington hills,MI", + "troy,MI", + "danbury,CT", + "tustin,CA", + "oshkosh,WI", + "brooklyn park,MN", + "dalton,GA", + 
"dothan,AL", + "ellicott city,MD", + "buena park,CA", + "bethesda,MD", + "arlington heights,IL", + "lakewood,CA", + "sterling,VA", + "brandon,FL", + "macomb township,MI", + "palm coast,FL", + "dallas,GA", + "anderson,IN", + "middletown,OH", + "parma,OH", + "evanston,IL", + "hickory,NC", + "greenwood,IN", + "covina,CA", + "new braunfels,TX", + "utica,MI", + "valparaiso,IN", + "thousand oaks,CA", + "sanford,FL", + "bismarck,ND", + "waldorf,MD", + "pocatello,ID", + "bloomington township,IL", + "gresham,OR", + "palatine,IL", + "camarillo,CA", + "iowa city,IA", + "apple valley,CA", + "warwick,RI", + "turlock,CA", + "jonesboro,AR", + "conway,AR", + "redmond,WA", + "stafford,VA", + "carmel,IN", + "upland,CA", + "porterville,CA", + "lexington,NC", + "monroe,NC", + "yuba city,CA", + "bellflower,CA", + "baldwin park,CA", + "lawrence,MA", + "norcross,GA", + "schaumburg,IL", + "warren,OH", + "southfield,MI", + "smyrna,GA", + "greer,SC", + "redondo beach,CA", + "huntington park,CA", + "huntington,WV", + "st. george,UT", + "avondale,AZ", + "chino hills,CA", + "kirkland,WA", + "wilkes-barre,PA", + "mountain view,CA", + "snellville,GA", + "plant city,FL", + "st. 
cloud,MN", + "palm harbor,FL", + "davis,CA", + "simpsonville,SC", + "manteca,CA", + "rochester hills,MI", + "brick,NJ", + "springfield,OR", + "somerville,MA", + "alameda,CA", + "redlands,CA", + "leesburg,VA", + "fishers,IN", + "hammond,IN", + "brighton,CO", + "san ramon,CA", + "new rochelle,NY", + "bolingbrook,IL", + "kokomo,IN", + "new britain,CT", + "marysville,WA", + "temple,TX", + "lodi,CA", + "mount pleasant,SC", + "janesville,WI", + "waipahu,HI", + "joplin,MO", + "opa-locka,FL", + "folsom,CA", + "great falls,MT", + "goldsboro,NC", + "oviedo,FL", + "rocky mount,NC", + "lauderhill,FL", + "rogers,AR", + "utica,NY", + "council bluffs,IA", + "pleasanton,CA", + "san tan valley,AZ", + "north charleston,SC", + "orange park,FL", + "league city,TX", + "waterford township,MI", + "plainfield,NJ", + "winter park,FL", + "bessemer,AL", + "pawtucket,RI", + "denham springs,LA", + "johnstown,PA", + "dubuque,IA", + "cherry hill,NJ", + "troy,NY", + "weston,FL", + "cranston,RI", + "pharr,TX", + "petaluma,CA", + "san rafael,CA", + "winchester,VA", + "traverse city,MI", + "white plains,NY", + "la mesa,CA", + "passaic,NJ", + "bossier city,LA", + "mooresville,NC", + "woodbury,MN", + "parkville,MD", + "lynwood,CA", + "tulare,CA", + "union city,CA", + "aiken,SC", + "shawnee,KS", + "north little rock,AR", + "dover,DE", + "statesville,NC", + "casper,WY", + "dublin,OH", + "casas adobes,AZ", + "lake elsinore,CA", + "pflugerville,TX", + "brandon,MS", + "port charlotte,FL", + "centreville,VA", + "jefferson city,MO", + "palm beach gardens,FL", + "framingham,MA", + "kenner,LA", + "pasco,WA", + "sanford,NC", + "tomball,TX", + "lorain,OH", + "grants pass,OR", + "bay city,MI", + "layton,UT", + "matthews,NC", + "la habra,CA", + "fort mill,SC", + "harrisonburg,VA", + "columbus,IN", + "castle rock,CO", + "spring valley,CA", + "mount vernon,NY", + "huntington station,NY", + "meridian,MS", + "gadsden,AL", + "mechanicsburg,PA", + "goodyear,AZ", + "stuart,FL", + "west lafayette,IN", + "milpitas,CA", + 
"hendersonville,NC", + "ithaca,NY", + "eagan,MN", + "yorba linda,CA", + "union city,NJ", + "state college,PA", + "port orange,FL", + "west bloomfield township,MI", + "stockbridge,GA", + "arcadia,CA", + "cocoa,FL", + "tamarac,FL", + "massillon,OH", + "glen allen,VA", + "hanford,CA", + "portland,ME", + "corvallis,OR", + "weatherford,TX", + "cedar park,TX", + "new bern,NC", + "tinley park,IL", + "cookeville,TN", + "decatur,AL", + "midland,MI", + "flower mound,TX", + "orland park,IL", + "suwanee,GA", + "blaine,MN", + "noblesville,IN", + "east orange,NJ", + "elyria,OH", + "punta gorda,FL", + "brooksville,FL", + "logan,UT", + "west hartford,CT", + "venice,FL", + "levittown,PA", + "fairbanks,AK", + "south san francisco,CA", + "east lansing,MI", + "buckeye,AZ", + "weslaco,TX", + "san marcos,TX", + "florence-graham,CA", + "bel air,MD", + "san clemente,CA", + "madison,AL", + "mechanicsville,VA", + "la crosse,WI", + "huntsville,TX", + "sheboygan,WI", + "san luis obispo,CA", + "berwyn,IL", + "altoona,PA", + "laguna niguel,CA", + "vienna,VA", + "florence,AL", + "eastvale,CA", + "taylor,MI", + "lufkin,TX", + "bayonne,NJ", + "newport beach,CA", + "pico rivera,CA", + "ames,IA", + "pottstown,PA", + "ewa beach,HI", + "coon rapids,MN", + "fayetteville,GA", + "casa grande,AZ", + "montebello,CA", + "bay shore,NY", + "clearfield,UT", + "san gabriel,CA", + "north miami,FL", + "middletown,NY", + "lakewood,WA", + "rosemead,CA", + "medina,OH", + "north port,FL", + "griffin,GA", + "valrico,FL", + "port orchard,WA", + "ormond beach,FL", + "carson city,NV", + "north richland hills,TX", + "deland,FL", + "moore,OK", + "saint cloud,FL", + "caldwell,ID", + "doral,FL", + "conway,SC", + "hamden,CT", + "valley stream,NY", + "biloxi,MS", + "meriden,CT", + "manhattan,KS", + "west new york,NJ", + "haverhill,MA", + "eden prairie,MN", + "north bergen,NJ", + "fond du lac,WI", + "brentwood,NY", + "rancho cordova,CA", + "cupertino,CA", + "vineland,NJ", + "texarkana,TX", + "snohomish,WA", + "mankato,MN", + 
"waltham,MA", + "crown point,IN", + "pasadena,MD", + "new iberia,LA", + "euless,TX", + "bristol,CT", + "bellevue,NE", + "danville,VA", + "powder springs,GA", + "burnsville,MN", + "skokie,IL", + "monterey park,CA", + "mentor,OH", + "millcreek,UT", + "cartersville,GA", + "easley,SC", + "petersburg,VA", + "round lake,IL", + "south whittier,CA", + "taylorsville,UT", + "west allis,WI", + "pontiac,MI", + "wilson,NC", + "woodland,CA", + "carrollton,GA", + "reston,VA", + "kendale lakes,FL", + "burleson,TX", + "phenix city,AL", + "azusa,CA", + "saint clair shores,MI", + "crystal lake,IL", + "lilburn,GA", + "newark,OH", + "lancaster,OH", + "malden,MA", + "wheaton,IL", + "greensburg,PA", + "loganville,GA", + "carmichael,CA", + "hempstead,NY", + "hendersonville,TN", + "fontainebleau,FL", + "new castle,DE", + "lake forest,CA", + "bartlett,TN", + "apex,NC", + "spring valley,NY", + "the hammocks,FL", + "albany,OR", + "bozeman,MT", + "martinsburg,WV", + "wesley chapel,FL", + "novato,CA", + "grove city,OH", + "east saint louis,IL", + "tamiami,FL", + "chicago heights,IL", + "medford,MA", + "chillicothe,OH", + "farmington,NM", + "dearborn heights,MI", + "riverdale,GA", + "howell,MI", + "lake city,FL", + "paducah,KY", + "golden,CO", + "yukon,OK", + "lutz,FL", + "lompoc,CA", + "gilroy,CA", + "midwest city,OK", + "rocklin,CA", + "royal oak,MI", + "westminster,MD", + "roswell,NM", + "oak lawn,IL", + "mchenry,IL", + "goshen,IN", + "new baltimore,MI", + "castro valley,CA", + "downers grove,IL", + "colton,CA", + "painesville,OH", + "altamonte springs,FL", + "marrero,LA", + "st. 
"""Scrape Indeed for every (location, title) seed pair and report row counts.

Running this module performs the scrape immediately: it builds a Mongo helper,
loads the location and title seed lists, then calls ``scrape_jobs`` once per
pair. Failures for a single pair are printed and skipped so that one bad
search does not abort the whole run.
"""
from typing import List

from dao.mongoDAO import MongoDBHelper
from tools.tools import get_location_list, get_title_list
import os
from jobspy import scrape_jobs
import pandas as pd
from jobspy.jobs import JobPost

# Helper intended for writing scraped data to Mongo.
mongo_helper = MongoDBHelper()

# Locations and titles to search for.
locations = get_location_list()
titles = get_title_list()


def write_jobs_to_mongo(job_list: List[JobPost], mongo: MongoDBHelper):
    """Print ``job_list``; persisting it through ``mongo`` is not wired up yet.

    Args:
        job_list: Jobs to store.
        mongo: Helper that will receive the jobs once persistence is enabled.
    """
    print(job_list)
    # TODO: persist the jobs. ``MongoDBHelper.insert_all`` takes
    # ``(collection_name, data_list)``, so a target collection name is needed.


try:
    for location in locations:
        for title in titles:
            try:
                jobs: pd.DataFrame = scrape_jobs(
                    site_name=["indeed"],
                    search_term=title,
                    location=location,
                    results_wanted=30,
                    country_indeed='USA',
                    # offset=25  # start jobs from an offset (use if search failed and want to continue)
                    proxy="http://34.120.172.140:8123"
                    # proxy="http://crawler-gost-proxy.jobright-internal.com:8080"
                )
            except Exception as e:
                # Best effort: report the failing pair and move on to the next one.
                print(f'Error when process: [{location}][{title}]')
                print(e)
                continue
            print(f'[{location}][{title}]: {jobs.shape[0]} rows append.')
finally:
    # Always release the Mongo client, even if the run is interrupted.
    mongo_helper.disconnect()
import json


def read_json_list(json_list_file):
    """Load a JSON file whose top-level value is an array and return it as a list.

    Args:
        json_list_file: Path of the JSON file to read.

    Returns:
        The decoded list of items.

    Raises:
        ValueError: If the file's top-level JSON value is not an array.
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, so the seed files decode the same way on every machine.
    with open(json_list_file, encoding='utf-8') as f:
        item_list = json.load(f)
    # Iterating a JSON object would yield its keys and iterating a JSON string
    # would yield characters; reject both instead of returning garbage.
    if not isinstance(item_list, list):
        raise ValueError(
            f'expected a JSON array in {json_list_file}, got {type(item_list).__name__}'
        )
    print(f'finish read json list file: {json_list_file}, res len: {len(item_list)}.')
    return item_list


def get_location_list(location_file='location_seed.json'):
    """Return the list of search locations stored in ``location_file``."""
    return read_json_list(location_file)


def get_title_list(title_file='title_seed.json'):
    """Return the list of job titles stored in ``title_file``."""
    return read_json_list(title_file)