ebay-ml-lister/scrape_ids.py

130 lines
3.8 KiB
Python
Raw Normal View History

from bs4 import BeautifulSoup as b
import re
import json
import requests
import concurrent.futures
import config as cfg
def get_isurl(category_id): # "get itemSearchURL"
    '''
    Gets raw JSON data from a FindingApi service call (findItemsByCategory)
    and builds paginated search-result URLs from the itemSearchURL found in
    the response. Currently being used to get itemIDs from categories.

    category_id: value sent as the "categoryId" request parameter.

    Returns a list of page URLs (pages 1-33, with the pre-owned condition
    filter appended), or an empty list if the request fails or the response
    does not have the expected structure. Callers extend() a list with the
    result, so a list is returned on every path.
    '''
    # NOTE(review): the second itemFilter has no index while the first uses
    # (0), and "PageNumber" is capitalised -- confirm both against the
    # Finding API reference. Keys are kept exactly as they were.
    params = {
        "OPERATION-NAME":'findItemsByCategory',
        "SECURITY-APPNAME":cfg.sec['SECURITY-APPNAME'],
        "SERVICE-VERSION":"1.13.0",
        "RESPONSE-DATA-FORMAT":"JSON",
        "categoryId":category_id,
        "paginationInput.entriesPerPage":"1",
        "paginationInput.PageNumber":1,
        "itemFilter(0).name":"Condition",
        "itemFilter(0).value":"Used",
        "itemFilter.name":"HideDuplicateItems",
        "itemFilter.value":"true",
        }

    try:
        response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1",
                params=params, timeout=24)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        print('connection error')
        # Previously returned an undefined name here (NameError).
        return []

    try:
        data = response.json()
        print(data)
        base_url = data['findItemsByCategoryResponse'][0]['itemSearchURL'][0]
    except (AttributeError, KeyError, IndexError, TypeError, ValueError):
        # ValueError covers a body that is not valid JSON; IndexError and
        # TypeError cover a response whose nesting differs from the expected
        # shape.
        print('AttributeError or KeyError. Exiting')
        return []

    # NOTE approx 220 pages of listings per cat @ 35 items per page
    item_cond = "&rt=nc&LH_ItemCondition=3000&mag=1" # preowned
    # The (unused) filter for new items was '&LH_ItemCondition=3'.

    urls = []
    for pg in range(1, 34): # No results after around page 32
        url = base_url+"&_pgn="+str(pg)+item_cond
        print(url)
        urls.append(url)
    return urls
def threaded_urls():
    '''
    Reads the JSON list in cat_list.txt and runs get_isurl() on every entry
    inside a ThreadPoolExecutor. The per-category URL lists are flattened
    into one list, in the order of the entries in the file.
    '''
    with open('cat_list.txt') as cat_file:
        categories = json.load(cat_file)

    with concurrent.futures.ThreadPoolExecutor() as pool:
        # map() yields one list of URLs per category, in input order.
        return [page_url
                for category_urls in pool.map(get_isurl, categories)
                for page_url in category_urls]
def get_ids(url):
    '''
    Scrapes listing links for item ID in url

    Fetches the page at url, finds every tag whose href contains a run of
    digits directly followed by "?hash", and extracts the first run of
    digits followed by "?" from each such href.

    Returns a list of ID strings (may contain duplicates), or an empty list
    if the page could not be fetched. Failures are reported and swallowed so
    that one bad page does not abort a whole threaded run.
    '''
    try:
        # Same timeout as get_isurl(); without one a stalled connection can
        # hang its worker thread indefinitely.
        response = requests.get(url, timeout=24)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        print('connection error')
        return []

    soup = b(response.text, "html.parser")
    print(soup)

    links = soup.find_all(href=re.compile(r"[\d]+(?=\?hash)"))
    # Every matched href contains digits followed by "?hash", so the second
    # pattern always finds at least one match and [0] is safe.
    ids = [re.findall(r"[\d]+(?=\?)", link['href'])[0] for link in links]
    print(ids)

    return ids
def threaded_get_ids(urls):
    '''
    Runs get_ids() w/in ThreadPoolExecutor() for multi threaded requests.
    Merges the scraped ids with any ids already stored in ids.txt,
    de-duplicates them, and saves both the unique ids (ids.txt) and the
    comma-joined groups of up to 20 ids (item_id_results.txt) for use with
    ebay_api methods. Returns the list of grouped id strings.
    '''
    # Start from previously saved ids when the file exists.
    try:
        with open('ids.txt') as id_file:
            collected = json.load(id_file)
    except FileNotFoundError:
        collected = []

    with concurrent.futures.ThreadPoolExecutor() as pool:
        for page_ids in pool.map(get_ids, urls):
            collected.extend(page_ids)

    # necessary; two links are returned with pattern match
    unique_ids = list(set(collected))

    # 20-ItemID list created to maximize dataset/decrease calls given call
    # constraints
    item_id_results = []
    for start in range(0, len(unique_ids), 20):
        item_id_results.append(','.join(unique_ids[start:start+20]))

    with open('ids.txt', 'w') as id_file:
        json.dump(unique_ids, id_file)

    with open('item_id_results.txt', 'w') as batch_file:
        json.dump(item_id_results, batch_file)

    return item_id_results
2021-12-01 01:32:01 +00:00
2021-12-01 20:43:14 +00:00
def id_count():
    '''
    Counts Unique IDs of item_id_results for testing

    Reads the JSON list of comma-joined id strings from item_id_results.txt,
    prints how many distinct ids it contains, and returns the full list of
    ids (duplicates included) in file order.
    '''
    with open('item_id_results.txt') as f:
        item_id_results = json.load(f)

    # Split each comma-joined group and skip empty pieces, so an empty
    # results file reports 0 ids instead of counting '' as an id.
    ids = [item_id
           for group in item_id_results
           for item_id in group.split(',')
           if item_id]

    uniq = len(set(ids))
    print('{} Unique IDs'.format(uniq))

    return ids
def main():
    '''
    Builds the search URLs with threaded_urls(), passes them to
    threaded_get_ids(), and returns the grouped item id strings it produces.
    '''
    return threaded_get_ids(threaded_urls())
# Run the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()