From 9d9d72e8d01e8af838addc13044f65a9372eadc4 Mon Sep 17 00:00:00 2001
From: spbeach46
Date: Thu, 11 Jun 2020 17:21:40 -0700
Subject: [PATCH] Added comments; changed html.parser to lxml

---
 bulkpp  |  1 +
 posh.py | 44 ++++++++++++++++++++++++++------------------
 2 files changed, 27 insertions(+), 18 deletions(-)
 create mode 160000 bulkpp

diff --git a/bulkpp b/bulkpp
new file mode 160000
index 0000000..32d3feb
--- /dev/null
+++ b/bulkpp
@@ -0,0 +1 @@
+Subproject commit 32d3febba22b73b37ac188d84330ae789288c535
diff --git a/posh.py b/posh.py
index 643ce9c..1405a0b 100644
--- a/posh.py
+++ b/posh.py
@@ -4,14 +4,16 @@ import time
 import re
 import concurrent.futures
 import numpy as np
-import matplotlib.pyplot as plt
+# import matplotlib.pyplot as plt
+

 def url_base_builder(search_query):
     genders = ['Men', 'Women']
-    posh_colors = ['Red', 'Pink', 'Orange', 'Yellow', 'Green', 'Blue', 'Purple', 'Gold', 'Silver', 'Black', 'Gray', 'White', 'Cream', 'Brown', 'Tan']
+    posh_colors = ['Red', 'Pink', 'Orange', 'Yellow', 'Green', 'Blue', 'Purple',
+                   'Gold', 'Silver', 'Black', 'Gray', 'White', 'Cream', 'Brown', 'Tan']

-    for i in range(0,len(posh_colors)):
+    for i in range(0, len(posh_colors)):
         if posh_colors[i] in search_query:
             url_color = '&color[]=' + posh_colors[i]
             color = posh_colors[i]
@@ -19,7 +21,7 @@ def url_base_builder(search_query):
     else:
         color = ''
         url_color = ''
-    for i in range(0,len(genders)):
+    for i in range(0, len(genders)):
         if genders[i] in search_query:
             url_gender = '&department=' + genders[i]
             gender = genders[i]
@@ -28,23 +30,26 @@ def url_base_builder(search_query):
         gender = ''
         url_gender = '&department=All'

-    sq = search_query.replace(color,'').replace(gender,'').replace('NEW','').replace(' ', '+')
+    sq = search_query.replace(color, '').replace(gender, '').replace('NEW', '').replace(' ', '+')

-    all_sold_url_base = 'https://poshmark.com/search?query=' + sq + "&availability=sold_out" + url_color + url_gender + '&max_id='
+    all_sold_url_base = 'https://poshmark.com/search?query=' + sq + \
+        "&availability=sold_out" + url_color + url_gender + '&max_id='

-    new_sold_url_base = 'https://poshmark.com/search?query=' + sq + '&availability=sold_out' + '&condition=nwt_and_ret' + url_color + url_gender + '&max_id='
+    new_sold_url_base = 'https://poshmark.com/search?query=' + sq + '&availability=sold_out' + \
+        '&condition=nwt_and_ret' + url_color + url_gender + '&max_id='

     return all_sold_url_base, new_sold_url_base

+
 def all_sold_list_builder(i):
     bases = url_base_builder(search_query)
     all_sold_url_base = bases[0]
     all_sold_prices = []
     url = all_sold_url_base + str(i)
     html = requests.get(url).text
-    soup = b(html, "html.parser")
-    #last_page = soup.find(string = re.compile('No Listings Found'))
-    for price in soup.find_all('span', {'class':'p--t--1 fw--bold'}):
+    soup = b(html, "lxml")
+    # last_page = soup.find(string = re.compile('No Listings Found'))
+    for price in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
         price = price.get_text()
         dollar_index = price.find('$')
         price = price[dollar_index+1:]
@@ -53,16 +58,17 @@ def all_sold_list_builder(i):
         all_sold_prices.append(price)
     return all_sold_prices

+
 def new_sold_list_builder(i):
     bases = url_base_builder(search_query)
     new_sold_url_base = bases[1]
     new_sold_prices = []
     url = new_sold_url_base + str(i)
     html = requests.get(url).text
-    soup = b(html, "html.parser")
-    #last_page = soup.find(string = re.compile('No Listings Found'))#this is present in all pages that don't have a full 48 listings on them. So you end up with an empty price list becuase of your conditional statement
+    soup = b(html, "lxml")
+    # last_page = soup.find(string = re.compile('No Listings Found'))  # this is present in all pages that don't have a full 48 listings on them, so you end up with an empty price list because of the conditional statement

-    for price in soup.find_all('span', {'class':'p--t--1 fw--bold'}):
+    for price in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
         price = price.get_text()
         dollar_index = price.find('$')
         price = price[dollar_index+1:]
@@ -71,11 +77,12 @@ def new_sold_list_builder(i):
         new_sold_prices.append(price)
     return new_sold_prices

+
 search_query = str(input('Title Search: '))

 start = time.time()

-page_list = list(range(1, 13))
+page_list = list(range(1, 2))
 all_sold_list = []
 new_sold_list = []

@@ -85,17 +92,18 @@ with concurrent.futures.ThreadPoolExecutor() as executor:
 with concurrent.futures.ThreadPoolExecutor() as executor:
     for future in executor.map(new_sold_list_builder, page_list):
-        new_sold_list.extend(future)
+        new_sold_list.extend(future)  # if the NWT prices could be pulled in the same pass as the used prices, this second executor wouldn't be needed
+

 for element in new_sold_list:
-    all_sold_list.remove(element)
+    all_sold_list.remove(element)

 used_sold_list = all_sold_list

 average_used_sold_price = '$' + str(round(np.mean(used_sold_list), 2))
 average_new_sold_price = '$' + str(round(np.mean(new_sold_list), 2))

-used_sold_results = str(len(used_sold_list)) + ' Results'
-new_sold_results = str(len(new_sold_list)) + ' Results'
+used_sold_results = str(len(used_sold_list)) + ' Used Results'
+new_sold_results = str(len(new_sold_list)) + ' NWT Results'
 total_results = str(len(used_sold_list) + len(new_sold_list)) + ' Total Results'

 end = time.time()
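Follow-up note on the commented-out last_page check: per the comment above, the 'No Listings Found' string appears on every page that has fewer than 48 listings, so gating the price loop on that marker throws away the prices that are on the final, partially filled page and leaves an empty price list. A minimal sketch of one way around this, collecting the page's prices first and using the marker only as a stop flag (sold_prices_on_page is an illustrative name; the marker string and the 'p--t--1 fw--bold' price class are taken from the comments above):

    import re
    import requests
    from bs4 import BeautifulSoup as b

    def sold_prices_on_page(url_base, i):
        # Hypothetical helper: fetch page i and parse it the same way
        # all_sold_list_builder / new_sold_list_builder do.
        html = requests.get(url_base + str(i)).text
        soup = b(html, "lxml")
        prices = []
        for span in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
            text = span.get_text()
            prices.append(float(text[text.find('$') + 1:]))
        # A short final page still contributes its prices; the flag only
        # tells the caller to stop requesting further pages.
        last_page = soup.find(string=re.compile('No Listings Found')) is not None
        return prices, last_page

The caller can then loop i = 1, 2, 3, ... and break once last_page is True, instead of hard-coding page_list.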
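On the "if the NWT prices could be pulled in the same pass" comment: even while the script still scrapes twice, the remove() loop is worth replacing. list.remove() is O(n) per element and raises ValueError if a NWT price isn't present in the all-sold list (the two scrapes hit the site at different moments, so they can disagree). A sketch using multiset subtraction instead (split_used_prices is an illustrative name):

    from collections import Counter

    def split_used_prices(all_sold_list, new_sold_list):
        # Counter subtraction drops negative counts, so a NWT price that
        # never showed up in the all-sold scrape contributes nothing
        # instead of raising ValueError.
        used = Counter(all_sold_list) - Counter(new_sold_list)
        return list(used.elements())

This is linear in the list sizes and leaves all_sold_list untouched.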
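One more edge case, made likelier by shrinking page_list to range(1, 2) for testing: a query with no NWT matches leaves new_sold_list empty, and np.mean([]) returns nan with a RuntimeWarning, so average_new_sold_price becomes '$nan'. A small guard, as a sketch (fmt_avg is an illustrative name):

    import numpy as np

    def fmt_avg(prices):
        # np.mean of an empty sequence is nan; report 'n/a' instead.
        return '$' + str(round(float(np.mean(prices)), 2)) if prices else 'n/a'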