traditional id scraper with BeautifulSoup
parent eb780faf40
commit ab064ce5cf

scrape_ids.py | 68 lines (new file)
@@ -0,0 +1,68 @@
from bs4 import BeautifulSoup as b
import re
import json
import requests
import concurrent.futures
import config as cfg
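
# NOTE: `config` is a local module that is not part of this commit. Based on
# the lookup below, it is assumed to expose a dict named `sec` holding the
# eBay Finding API credential; a minimal hypothetical config.py would be:
#
#     sec = {'SECURITY-APPNAME': 'YOUR-EBAY-APP-ID'}
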

def get_isurl(category_id):  # "get itemSearchURL"
    '''
    Gets raw JSON data from the FindingApi service call. Currently being used
    to get itemIDs from categories.
    '''
    params = {
        "OPERATION-NAME": 'findItemsByCategory',
        "SECURITY-APPNAME": cfg.sec['SECURITY-APPNAME'],
        "SERVICE-VERSION": "1.13.0",
        "RESPONSE-DATA-FORMAT": "JSON",
        "categoryId": category_id,
        "paginationInput.entriesPerPage": "1",
        "paginationInput.PageNumber": 1,
        "itemFilter(0).name": "Condition",
        "itemFilter(0).value": "Used",
        # itemFilters are indexed when more than one is supplied
        "itemFilter(1).name": "HideDuplicateItems",
        "itemFilter(1).value": "true",
    }

    try:
        response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
                                params=params, timeout=24)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        print('connection error')
        return None  # no url has been built yet, so there is nothing else to return
    try:
        data = response.json()
        i = 1  # NOTE approx 220 pages of listings per cat @ 35 items per page
        pg = "&_pgn={}".format(str(i))
        item_cond = "&rt=nc&LH_ItemCondition=3000&mag=1"  # preowned
        url = data['findItemsByCategoryResponse'][0]['itemSearchURL'][0].replace('&_pgn=1', pg)
        url = url + item_cond
    except (AttributeError, KeyError):
        print('AttributeError or KeyError. Exiting')
        print(response.json())
        return data

    return url
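
# For reference, the JSON consumed above is assumed to look roughly like the
# following, trimmed to the only fields that are actually read (a sketch, not
# the full Finding API response schema):
#
#     {"findItemsByCategoryResponse": [
#         {"itemSearchURL": ["https://www.ebay.com/sch/...?...&_pgn=1"]}
#     ]}
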

def threaded_urls(url):
    '''
    Builds one itemSearchURL per category listed in cat_list.txt, running the
    lookups concurrently. The url argument is currently unused.
    '''
    urls = []
    with open('cat_list.txt') as jf:
        cat_list = json.load(jf)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        for future in executor.map(get_isurl, cat_list):
            urls.append(future)

    return urls
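
# cat_list.txt is expected to hold a JSON array of eBay category ids, one per
# category to scrape; hypothetical example contents:
#
#     [11450, 15032, 58058]
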

def get_ids(url):
    '''Scrapes the numeric item ids out of one search-results page.'''
    html = requests.get(url).text
    soup = b(html, "html.parser")
    ids = soup.find_all(href=re.compile(r"[\d]+(?=\?hash)"))
    ids = [id['href'] for id in ids]
    ids = [re.findall(r"[\d]+(?=\?)", id) for id in ids]
    return ids
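
A short usage sketch tying the pieces together, assuming cat_list.txt sits next to scrape_ids.py; the None placeholder for the unused url argument and the filtering of failed lookups are illustrative only:

from scrape_ids import threaded_urls, get_ids

urls = [u for u in threaded_urls(None) if u]   # skip categories whose lookup failed
item_ids = [get_ids(u) for u in urls]          # item ids scraped from each results page
print(item_ids)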