2021-11-26 01:31:54 +00:00
|
|
|
from bs4 import BeautifulSoup as b
|
|
|
|
import re
|
|
|
|
import json
|
|
|
|
import requests
|
|
|
|
import concurrent.futures
|
|
|
|
import config as cfg
|
|
|
|
|
|
|
|
def get_isurl(category_id, max_pages=220):  # "get itemSearchURL"
    '''
    Builds the paginated search URLs for one category.

    Calls the FindingService findItemsByCategory operation for category_id,
    reads the itemSearchURL out of the JSON response, appends the pre-owned
    condition filter and returns one URL per result page.

    category_id: value sent to the service as "categoryId".
    max_pages: number of result pages to generate URLs for (1..max_pages).

    Returns a list of URL strings. The list is empty when the request fails
    or the response does not have the expected structure, so callers can
    always extend their own list with the result.
    '''
    params = {
        "OPERATION-NAME": 'findItemsByCategory',
        "SECURITY-APPNAME": cfg.sec['SECURITY-APPNAME'],
        "SERVICE-VERSION": "1.13.0",
        "RESPONSE-DATA-FORMAT": "JSON",
        "categoryId": category_id,
        "paginationInput.entriesPerPage": "1",
        "paginationInput.PageNumber": 1,
        "itemFilter(0).name": "Condition",
        "itemFilter(0).value": "Used",
        "itemFilter.name": "HideDuplicateItems",
        "itemFilter.value": "true",
    }

    try:
        response = requests.get(
            "https://svcs.ebay.com/services/search/FindingService/v1",
            params=params, timeout=24)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        print('connection error')
        # Nothing was fetched, so there is no URL to hand back.
        return []

    item_cond = "&rt=nc&LH_ItemCondition=3000&mag=1"  # preowned

    try:
        data = response.json()
        base_url = (
            data['findItemsByCategoryResponse'][0]['itemSearchURL'][0]
            + item_cond
        )
    except (AttributeError, KeyError, IndexError, TypeError, ValueError):
        # ValueError covers a body that is not valid JSON; the others cover
        # a JSON document that lacks the expected nesting.
        print('unexpected FindingService response for category {}'.format(
            category_id))
        return []

    # NOTE approx 220 pages of listings per cat @ 35 items per page
    # Every page URL is derived from the untouched base URL. Rewriting the
    # previous page's URL stops working as soon as it no longer reads
    # "_pgn=1".
    page_param = re.compile(r"(?<=[?&]_pgn=)\d+")
    urls = []
    for page in range(1, max_pages + 1):
        page_url, replaced = page_param.subn(str(page), base_url, count=1)
        if not replaced:
            # The service URL carried no page parameter; add one.
            page_url = "{}&_pgn={}".format(base_url, page)
        urls.append(page_url)

    return urls
|
2021-11-26 01:31:54 +00:00
|
|
|
|
2021-11-27 01:46:51 +00:00
|
|
|
def threaded_urls():
    '''
    Collects the search URLs for every category listed in cat_list.txt.

    The file is parsed as JSON, get_isurl is run for each entry on a thread
    pool, and the per-category results are flattened into one list, which is
    returned.
    '''
    with open('cat_list.txt') as cat_file:
        categories = json.load(cat_file)

    with concurrent.futures.ThreadPoolExecutor() as pool:
        # map() yields each category's result in input order.
        collected = [
            page_url
            for category_urls in pool.map(get_isurl, categories)
            for page_url in category_urls
        ]

    return collected
|
|
|
|
|
2021-11-27 01:46:51 +00:00
|
|
|
def get_ids(url):
    '''
    Scrapes listing links for item ID in url

    url: address of the page to download and parse.

    Returns a list of unique item-ID strings in order of first appearance.
    The list is empty when the page cannot be downloaded.
    '''
    try:
        # requests waits indefinitely unless a timeout is given, which would
        # stall a worker thread on a single unresponsive page.
        response = requests.get(url, timeout=24)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        print('connection error')
        return []

    soup = b(response.text, "html.parser")
    links = soup.find_all(href=re.compile(r"[\d]+(?=\?hash)"))

    found = []
    for link in links:
        matches = re.findall(r"[\d]+(?=\?)", link['href'])
        if matches:
            found.append(matches[0])

    # necessary; two links are returned with pattern match.
    # dict.fromkeys drops the repeats while keeping a stable order.
    return list(dict.fromkeys(found))
|
|
|
|
|
|
|
|
def threaded_get_ids(urls):
    '''
    Gathers item IDs from every page in urls and stores them in batches.

    IDs already saved in item_id_results.txt are loaded first, the IDs
    returned by get_ids for each url are added, and the combined list is
    written back to the same file as JSON.

    urls: iterable of page addresses passed one by one to get_ids.

    Returns the list that was written: strings of up to 20 comma-separated
    item IDs each.
    '''
    try:
        with open('item_id_results.txt') as f:
            stored_batches = json.load(f)
    except FileNotFoundError:
        stored_batches = []

    # The file holds comma-joined batches, not single IDs. Split them back
    # apart so they are re-batched correctly instead of being nested inside
    # new batches.
    ids = [
        item_id
        for batch in stored_batches
        for item_id in str(batch).split(',')
        if item_id
    ]

    with concurrent.futures.ThreadPoolExecutor() as executor:
        for page_ids in executor.map(get_ids, urls):
            ids.extend(page_ids)

    # The same item can show up on several pages or in an earlier run;
    # keep the first occurrence only.
    ids = list(dict.fromkeys(ids))

    # 20-ItemID list created to maximize dataset/decrease calls given call
    # constraints
    item_id_results = [
        ','.join(ids[n:n + 20]) for n in range(0, len(ids), 20)
    ]

    with open('item_id_results.txt', 'w') as f:
        json.dump(item_id_results, f)

    return item_id_results
|
2021-12-01 01:32:01 +00:00
|
|
|
|
2021-11-27 01:46:51 +00:00
|
|
|
def main():
    '''
    Runs the full pipeline: builds the search URLs with threaded_urls, feeds
    them to threaded_get_ids, and returns the result of that call.
    '''
    return threaded_get_ids(threaded_urls())
|
2021-11-26 01:31:54 +00:00
|
|
|
|
2021-11-27 01:46:51 +00:00
|
|
|
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
|