# poshmark/posh.py — scrape Poshmark sold listings for a search query and
# report the average sold price of used vs. new-with-tags items.
import concurrent.futures
import re
import time
from collections import Counter

import numpy as np
import requests
from bs4 import BeautifulSoup as b

# import matplotlib.pyplot as plt
def url_base_builder(search_query):
    """Build the two Poshmark sold-listing search URLs for *search_query*.

    Recognized color and gender keywords are pulled out of the query and
    turned into URL filter parameters; the remaining words become the
    ``query=`` string (spaces encoded as ``+``).  Returns a tuple
    ``(all_sold_url_base, new_sold_url_base)``, each ending in ``&max_id=``
    so the caller can append a page number.
    """
    genders = ['Men', 'Women']
    posh_colors = ['Red', 'Pink', 'Orange', 'Yellow', 'Green', 'Blue', 'Purple',
                   'Gold', 'Silver', 'Black', 'Gray', 'White', 'Cream', 'Brown', 'Tan']

    # First matching color keyword wins; the for/else arm runs when no
    # color keyword appears in the query.
    for candidate in posh_colors:
        if candidate in search_query:
            color = candidate
            url_color = '&color[]=' + candidate
            break
    else:
        color = ''
        url_color = ''

    for candidate in genders:
        if candidate in search_query:
            gender = candidate
            url_gender = '&department=' + candidate
            break
    else:
        gender = ''
        url_gender = '&department=All'

    # Strip the keywords we already turned into filters, then URL-encode spaces.
    sq = search_query.replace(color, '').replace(gender, '').replace('NEW', '').replace(' ', '+')

    all_sold_url_base = ('https://poshmark.com/search?query=' + sq +
                         '&availability=sold_out' + url_color + url_gender + '&max_id=')

    new_sold_url_base = ('https://poshmark.com/search?query=' + sq +
                         '&availability=sold_out' + '&condition=nwt_and_ret' +
                         url_color + url_gender + '&max_id=')

    return all_sold_url_base, new_sold_url_base
def all_sold_list_builder(i):
    """Fetch page *i* of the all-sold search and return its listing prices.

    Reads the module-level ``search_query`` (set by ``main``) to build the
    URL, so it can be mapped over page numbers by a thread pool.
    """
    all_sold_url_base, _ = url_base_builder(search_query)
    page_url = all_sold_url_base + str(i)
    soup = b(requests.get(page_url).text, "lxml")
    prices = []
    # Each sold listing's price lives in a <span class="p--t--1 fw--bold">.
    for span in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
        text = span.get_text()
        # Keep what follows the '$', then cut at the space that separates
        # the sold price from the rest of the text.
        # NOTE(review): the extra -1 drops one more trailing character —
        # presumably matching the markup observed when this was written;
        # verify against a live page.
        text = text[text.find('$') + 1:]
        prices.append(int(text[:text.find(' ') - 1]))
    return prices
def new_sold_list_builder(i):
    """Fetch page *i* of the new-with-tags sold search and return its prices.

    Reads the module-level ``search_query`` (set by ``main``) to build the
    URL, so it can be mapped over page numbers by a thread pool.
    """
    _, new_sold_url_base = url_base_builder(search_query)
    page_url = new_sold_url_base + str(i)
    soup = b(requests.get(page_url).text, "lxml")
    prices = []
    # Each sold listing's price lives in a <span class="p--t--1 fw--bold">.
    # (Pages with fewer than 48 listings contain a 'No Listings Found'
    # marker that a pagination loop could test for.)
    for span in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
        text = span.get_text()
        # Keep what follows the '$', then cut at the space that separates
        # the sold price from the rest of the text.
        # NOTE(review): the extra -1 drops one more trailing character —
        # presumably matching the markup observed when this was written;
        # verify against a live page.
        text = text[text.find('$') + 1:]
        prices.append(int(text[:text.find(' ') - 1]))
    return prices
def _format_average(prices):
    """Average of *prices* as a dollar string, or 'N/A' when empty
    (np.mean of an empty list would otherwise print as '$nan')."""
    return '$' + str(round(np.mean(prices), 2)) if prices else 'N/A'


def main():
    """Prompt for a search query, scrape sold listings concurrently, and
    print average used / new-with-tags sold prices with result counts."""
    # BUG FIX: the list-builder workers read search_query at module scope;
    # a plain local assignment here left them raising NameError.
    global search_query
    search_query = input('Title Search: ')

    start = time.time()

    page_list = list(range(1, 2))  # pages to scrape; widen the range for more
    all_sold_list = []
    new_sold_list = []

    with concurrent.futures.ThreadPoolExecutor() as executor:
        for page_prices in executor.map(all_sold_list_builder, page_list):
            all_sold_list.extend(page_prices)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for page_prices in executor.map(new_sold_list_builder, page_list):
            new_sold_list.extend(page_prices)

    # Used = all sold minus the new-with-tags ones.  Multiset subtraction via
    # Counter is O(n) and, unlike repeated list.remove, does not raise
    # ValueError when a NWT price happens not to appear in the all-sold pages.
    used_sold_list = list((Counter(all_sold_list) - Counter(new_sold_list)).elements())

    average_used_sold_price = _format_average(used_sold_list)
    average_new_sold_price = _format_average(new_sold_list)

    used_sold_results = str(len(used_sold_list)) + ' Used Results'
    new_sold_results = str(len(new_sold_list)) + ' NWT Results'
    total_results = str(len(used_sold_list) + len(new_sold_list)) + ' Total Results'

    end = time.time()

    print(end - start, 'seconds')

    print('Average Used Sold Price', average_used_sold_price, used_sold_results)
    print('Average New Sold Price', average_new_sold_price, new_sold_results)
    print(total_results)


if __name__ == '__main__':
    main()