# poshmark/posh.py

import requests
from bs4 import BeautifulSoup as b
import time
import re
import concurrent.futures
import numpy as np
# import matplotlib.pyplot as plt

def url_base_builder(search_query):
    """Build the all-sold and new-sold (NWT) Poshmark search URL bases for a query."""
    genders = ['Men', 'Women']
    posh_colors = ['Red', 'Pink', 'Orange', 'Yellow', 'Green', 'Blue', 'Purple',
                   'Gold', 'Silver', 'Black', 'Gray', 'White', 'Cream', 'Brown', 'Tan']

    # Pull an optional color filter out of the search query.
    for posh_color in posh_colors:
        if posh_color in search_query:
            color = posh_color
            url_color = '&color[]=' + posh_color
            break
    else:
        color = ''
        url_color = ''

    # Pull an optional department (gender) filter out of the search query.
    for posh_gender in genders:
        if posh_gender in search_query:
            gender = posh_gender
            url_gender = '&department=' + posh_gender
            break
    else:
        gender = ''
        url_gender = '&department=All'

    # Strip the filter words and the NEW flag from the query, then URL-encode spaces.
    sq = search_query.replace(color, '').replace(gender, '').replace('NEW', '').replace(' ', '+')

    all_sold_url_base = 'https://poshmark.com/search?query=' + sq + \
        '&availability=sold_out' + url_color + url_gender + '&max_id='
    new_sold_url_base = 'https://poshmark.com/search?query=' + sq + '&availability=sold_out' + \
        '&condition=nwt_and_ret' + url_color + url_gender + '&max_id='
    return all_sold_url_base, new_sold_url_base

def all_sold_list_builder(i):
    """Collect sold prices (all conditions) from page i of the search results."""
    all_sold_url_base, _ = url_base_builder(search_query)
    all_sold_prices = []
    url = all_sold_url_base + str(i)
    html = requests.get(url).text
    soup = b(html, 'lxml')
    # last_page = soup.find(string=re.compile('No Listings Found'))
    for price_tag in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
        # The span text starts with the sale price, e.g. '$25'; grab the whole
        # dollar amount after the first '$' (prices are treated as whole dollars).
        match = re.search(r'\$(\d+)', price_tag.get_text())
        if match:
            all_sold_prices.append(int(match.group(1)))
    return all_sold_prices

def new_sold_list_builder(i):
    """Collect sold prices for new-with-tags (NWT) listings from page i."""
    _, new_sold_url_base = url_base_builder(search_query)
    new_sold_prices = []
    url = new_sold_url_base + str(i)
    html = requests.get(url).text
    soup = b(html, 'lxml')
    # last_page = soup.find(string=re.compile('No Listings Found'))
    # 'No Listings Found' is present on every page that doesn't have a full 48
    # listings, so gating on it yields an empty price list for the last page.
    for price_tag in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
        match = re.search(r'\$(\d+)', price_tag.get_text())
        if match:
            new_sold_prices.append(int(match.group(1)))
    return new_sold_prices
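
# Sketch of the commented-out last_page idea above: a builder variant that also
# reports whether the page contained the 'No Listings Found' marker, so a caller
# could stop paginating. Not wired into the script; the function name is a
# placeholder and the marker check is the one hinted at in the comments.
def all_sold_page_with_marker(i):
    """Return (prices, is_last_page) for page i of the all-sold search."""
    all_sold_url_base, _ = url_base_builder(search_query)
    soup = b(requests.get(all_sold_url_base + str(i)).text, 'lxml')
    # The marker appears on any page with fewer than a full 48 listings.
    is_last_page = soup.find(string=re.compile('No Listings Found')) is not None
    prices = []
    for price_tag in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
        match = re.search(r'\$(\d+)', price_tag.get_text())
        if match:
            prices.append(int(match.group(1)))
    return prices, is_last_page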

search_query = input('Title Search: ')
start = time.time()

page_list = list(range(1, 2))
all_sold_list = []
new_sold_list = []

with concurrent.futures.ThreadPoolExecutor() as executor:
    for future in executor.map(all_sold_list_builder, page_list):
        all_sold_list.extend(future)

with concurrent.futures.ThreadPoolExecutor() as executor:
    for future in executor.map(new_sold_list_builder, page_list):
        # If the NWT price could be pulled at the same time as the used price,
        # this second pass would not be needed.
        new_sold_list.extend(future)

# Every NWT sale also appears in the all-sold list, so remove one matching
# price per NWT sale, leaving only the used sales.
for element in new_sold_list:
    all_sold_list.remove(element)
used_sold_list = all_sold_list

average_used_sold_price = '$' + str(round(np.mean(used_sold_list), 2))
average_new_sold_price = '$' + str(round(np.mean(new_sold_list), 2))
used_sold_results = str(len(used_sold_list)) + ' Used Results'
new_sold_results = str(len(new_sold_list)) + ' NWT Results'
total_results = str(len(used_sold_list) + len(new_sold_list)) + ' Total Results'
end = time.time()
print(end - start, 'seconds')
print('Average Used Sold Price', average_used_sold_price, used_sold_results)
print('Average New Sold Price', average_new_sold_price, new_sold_results)
print(total_results)

'''There has to be a way to determine the number of result pages before making far too many requests. Look at the Network tab in the element inspector to see whether some kind of ID gives away the page type; some responses might differ in more than their HTML. Otherwise, a payload-size threshold might work: if a response is smaller than some number of KB, treat it as an empty page and skip it.
This will be significant for speeding up the program. If it keeps having to make requests to 20 pages and wait on the results, it may not be much faster than just using pms.py, but limiting requests to only what is absolutely required would be best. Also need to check whether multiprocessing would be better for crunching all the prices in the lists.
Another workaround is to loop over the pages in chunks of more than one: send pages 1-4 to the thread pool, then the next four, and so on in a while loop that stops once 'No Listings Found' appears. The thread pool could even use separate page lists for the new and all-sold searches.'''
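
# A rough sketch of the chunked-pagination idea in the notes above, not wired
# into the script: request pages in groups with the thread pool and stop once a
# page comes back short of a full 48 listings (the same condition under which
# 'No Listings Found' appears). The function name, the chunk size, and the
# 48-listings-per-page figure are illustrative assumptions, not tested values.
def fetch_prices_in_chunks(list_builder, chunk_size=4, page_size=48):
    prices = []
    first_page = 1
    while True:
        pages = list(range(first_page, first_page + chunk_size))
        with concurrent.futures.ThreadPoolExecutor() as executor:
            results = list(executor.map(list_builder, pages))
        for page_prices in results:
            prices.extend(page_prices)
        # A short or empty page means there are no more full pages to fetch.
        if any(len(page_prices) < page_size for page_prices in results):
            break
        first_page += chunk_size
    return prices

# Usage would look like: all_sold_list = fetch_prices_in_chunks(all_sold_list_builder)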