import requests
from bs4 import BeautifulSoup as b
import time
import re
import concurrent.futures
import numpy as np
# import matplotlib.pyplot as plt


def url_base_builder(search_query):
    """Build the sold-listing search URLs (all sold and NWT sold) for a query."""
    genders = ['Men', 'Women']

    posh_colors = ['Red', 'Pink', 'Orange', 'Yellow', 'Green', 'Blue', 'Purple',
                   'Gold', 'Silver', 'Black', 'Gray', 'White', 'Cream', 'Brown', 'Tan']

    # If the query names one of Poshmark's filter colors, move it into a color filter.
    for posh_color in posh_colors:
        if posh_color in search_query:
            color = posh_color
            url_color = '&color[]=' + posh_color
            break
    else:
        color = ''
        url_color = ''

    # Likewise for the department (gender) filter.
    for gender_option in genders:
        if gender_option in search_query:
            gender = gender_option
            url_gender = '&department=' + gender_option
            break
    else:
        gender = ''
        url_gender = '&department=All'

    # Strip the filter words out of the free-text query and URL-encode the spaces.
    sq = search_query.replace(color, '').replace(gender, '').replace('NEW', '').replace(' ', '+')

    all_sold_url_base = 'https://poshmark.com/search?query=' + sq + \
        '&availability=sold_out' + url_color + url_gender + '&max_id='

    new_sold_url_base = 'https://poshmark.com/search?query=' + sq + '&availability=sold_out' + \
        '&condition=nwt_and_ret' + url_color + url_gender + '&max_id='

    return all_sold_url_base, new_sold_url_base


def all_sold_list_builder(i):
    """Scrape the sold prices from page i of the all-sold search results."""
    all_sold_url_base, _ = url_base_builder(search_query)
    all_sold_prices = []
    url = all_sold_url_base + str(i)
    html = requests.get(url).text
    soup = b(html, 'lxml')
    # last_page = soup.find(string=re.compile('No Listings Found'))
    for price in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
        # Take the first dollar amount in the tag (the sold price) and ignore
        # anything after it, e.g. a crossed-out original price.
        match = re.search(r'\$([\d,]+)', price.get_text())
        if match:
            all_sold_prices.append(int(match.group(1).replace(',', '')))
    return all_sold_prices


def new_sold_list_builder(i):
    """Scrape the sold prices from page i of the NWT (new with tags) search results."""
    _, new_sold_url_base = url_base_builder(search_query)
    new_sold_prices = []
    url = new_sold_url_base + str(i)
    html = requests.get(url).text
    soup = b(html, 'lxml')
    # last_page = soup.find(string=re.compile('No Listings Found'))
    # 'No Listings Found' is present on every page that doesn't have a full 48
    # listings, so the conditional built on it left the price list empty
    # (see the last-page helper sketched after this function).
    for price in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
        # Same parsing as above: first dollar amount in the tag.
        match = re.search(r'\$([\d,]+)', price.get_text())
        if match:
            new_sold_prices.append(int(match.group(1).replace(',', '')))
    return new_sold_prices


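# A minimal sketch of the last-page check hinted at in the commented-out lines
# above: 'No Listings Found' only appears on a results page that isn't full,
# so its presence means there is nothing past that page. The helper name and
# the idea of checking it before parsing are assumptions, not part of the
# original script.
def reached_last_page(soup):
    """Return True if this results page is the last one for the query."""
    return soup.find(string=re.compile('No Listings Found')) is not None

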
search_query = input('Title Search: ')

start = time.time()

page_list = list(range(1, 2))  # just page 1 for now; see the note at the bottom about finding the last page
all_sold_list = []
new_sold_list = []

# executor.map yields each page's price list as it completes.
with concurrent.futures.ThreadPoolExecutor() as executor:
    for page_prices in executor.map(all_sold_list_builder, page_list):
        all_sold_list.extend(page_prices)

with concurrent.futures.ThreadPoolExecutor() as executor:
    for page_prices in executor.map(new_sold_list_builder, page_list):
        # If the NWT price could be pulled in the same pass as the used prices,
        # this second round of requests would be unnecessary.
        new_sold_list.extend(page_prices)

# The NWT results are assumed to be a subset of the all-sold results, so
# removing the NWT prices one-for-one leaves only the used-item prices.
for element in new_sold_list:
    all_sold_list.remove(element)
used_sold_list = all_sold_list

average_used_sold_price = '$' + str(round(np.mean(used_sold_list), 2))
average_new_sold_price = '$' + str(round(np.mean(new_sold_list), 2))

used_sold_results = str(len(used_sold_list)) + ' Used Results'
new_sold_results = str(len(new_sold_list)) + ' NWT Results'
total_results = str(len(used_sold_list) + len(new_sold_list)) + ' Total Results'

end = time.time()

print(end - start, 'seconds')

print('Average Used Sold Price', average_used_sold_price, used_sold_results)
print('Average New Sold Price', average_new_sold_price, new_sold_results)
print(total_results)

'''There has to be a way to determine the number of result pages before making far
too many requests. Look at the network tab in the element inspector to see if there
is some kind of id that gives away the page type; some responses might differ in
more than just their HTML. Otherwise, a payload-size threshold might work: if a
response is smaller than some number of KB, skip it.

This will be significant for speeding up the program. Having to request 20 pages
and wait on the results may not be much faster than just using pms.py, but limiting
the requests to only what is absolutely required would be best. It is also worth
checking whether multiprocessing would be better for crunching all the prices in
the lists.

Another workaround is to loop in chunks of more than one page: make the first page
list 1-4, send it to the thread pool, then take the next 4 pages and do the same
thing until 'No Listings Found' comes back, at which point the while loop stops.
The thread pool could even use separate page lists for the new and all-sold
searches. A sketch of this chunked loop follows below.'''
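

# A rough sketch of the chunked loop described above, under a few assumptions
# that the script doesn't establish: the batch size of 4 is arbitrary, and an
# empty batch is taken to mean the previous chunk held the last page.
# scrape_in_chunks is a hypothetical helper, not part of the original program.
def scrape_in_chunks(page_builder, chunk_size=4):
    prices = []
    first_page = 1
    while True:
        pages = range(first_page, first_page + chunk_size)
        with concurrent.futures.ThreadPoolExecutor() as executor:
            results = list(executor.map(page_builder, pages))
        batch = [p for page_prices in results for p in page_prices]
        if not batch:
            # No prices in the whole chunk: we are past the 'No Listings Found' page.
            break
        prices.extend(batch)
        first_page += chunk_size
    return prices

# Usage would look something like:
#     all_sold_list = scrape_in_chunks(all_sold_list_builder)
#     new_sold_list = scrape_in_chunks(new_sold_list_builder)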