import requests
from bs4 import BeautifulSoup as b
import time
import re
import concurrent.futures
import numpy as np
# import matplotlib.pyplot as plt
def url_base_builder(search_query):
    genders = ['Men', 'Women']

    posh_colors = ['Red', 'Pink', 'Orange', 'Yellow', 'Green', 'Blue', 'Purple',
                   'Gold', 'Silver', 'Black', 'Gray', 'White', 'Cream', 'Brown', 'Tan']

    # if the search query names a color, add it to the URL as a filter
    for i in range(0, len(posh_colors)):
        if posh_colors[i] in search_query:
            url_color = '&color[]=' + posh_colors[i]
            color = posh_colors[i]
            break
    else:
        color = ''
        url_color = ''

    # likewise for a department (gender) filter
    for i in range(0, len(genders)):
        if genders[i] in search_query:
            url_gender = '&department=' + genders[i]
            gender = genders[i]
            break
    else:
        gender = ''
        url_gender = '&department=All'

    # strip the color, gender, and 'NEW' keywords out of the query text and
    # encode the remaining spaces for the URL
    sq = search_query.replace(color, '').replace(gender, '').replace('NEW', '').replace(' ', '+')

    all_sold_url_base = 'https://poshmark.com/search?query=' + sq + \
        '&availability=sold_out' + url_color + url_gender + '&max_id='

    new_sold_url_base = 'https://poshmark.com/search?query=' + sq + '&availability=sold_out' + \
        '&condition=nwt_and_ret' + url_color + url_gender + '&max_id='

    return all_sold_url_base, new_sold_url_base


def all_sold_list_builder(i):
    # collects the prices of all sold listings (used and new) on results page i
    bases = url_base_builder(search_query)
    all_sold_url_base = bases[0]
    all_sold_prices = []
    url = all_sold_url_base + str(i)
    html = requests.get(url).text
    soup = b(html, 'lxml')
    # last_page = soup.find(string=re.compile('No Listings Found'))
    for price in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
        price = price.get_text()
        # pull the integer dollar amount out of the listing text
        dollar_index = price.find('$')
        price = price[dollar_index + 1:]
        space = price.find(' ')
        price = int(price[:space - 1])
        all_sold_prices.append(price)
    return all_sold_prices


def new_sold_list_builder(i):
    # collects the prices of sold NWT (new with tags) listings on results page i
    bases = url_base_builder(search_query)
    new_sold_url_base = bases[1]
    new_sold_prices = []
    url = new_sold_url_base + str(i)
    html = requests.get(url).text
    soup = b(html, 'lxml')
    # last_page = soup.find(string=re.compile('No Listings Found'))
    # 'No Listings Found' is present on every page that doesn't have a full 48
    # listings, so such a page ends up with an empty price list because of the
    # conditional statement
    for price in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
        price = price.get_text()
        # pull the integer dollar amount out of the listing text
        dollar_index = price.find('$')
        price = price[dollar_index + 1:]
        space = price.find(' ')
        price = int(price[:space - 1])
        new_sold_prices.append(price)
    return new_sold_prices


def main():
    # the page-builder functions above read search_query as a global, so
    # declare it here before prompting for it
    global search_query
    search_query = str(input('Title Search: '))

    start = time.time()

    page_list = list(range(1, 2))  # only the first results page for now
    all_sold_list = []
    new_sold_list = []

    with concurrent.futures.ThreadPoolExecutor() as executor:
        for future in executor.map(all_sold_list_builder, page_list):
            all_sold_list.extend(future)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        for future in executor.map(new_sold_list_builder, page_list):
            new_sold_list.extend(future)  # if the NWT prices could be pulled in the same pass as the used ones, this second pass wouldn't be needed

    # all_sold_list holds both used and NWT prices; removing the NWT prices
    # leaves only the used ones
    for element in new_sold_list:
        all_sold_list.remove(element)
    used_sold_list = all_sold_list

    average_used_sold_price = '$' + str(round(np.mean(used_sold_list), 2))
    average_new_sold_price = '$' + str(round(np.mean(new_sold_list), 2))

    used_sold_results = str(len(used_sold_list)) + ' Used Results'
    new_sold_results = str(len(new_sold_list)) + ' NWT Results'
    total_results = str(len(used_sold_list) + len(new_sold_list)) + ' Total Results'

    end = time.time()

    print(end - start, 'seconds')

    print('Average Used Sold Price', average_used_sold_price, used_sold_results)
    print('Average New Sold Price', average_new_sold_price, new_sold_results)
    print(total_results)


if __name__ == '__main__':
    main()


'''
To speed up the program a few things could be included:
1) Only parse the total results and sift out the NWT listings to create a separate NWT list.
2) Implement ProcessPoolExecutor to use more than one worker to parse the pages.
3) Find a better way to find the last page so you don't have to make more requests than
   necessary. This could be either taking the "smallest" "No Listings Found" page while
   excluding the others after the smallest one is found, or determining from the request
   headers whether a page is worth downloading or not.
4) Use a while loop in chunks of 2-4 pages to find the last page, in conjunction with number 3.
'''
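
# A minimal sketch of ideas 3) and 4) above, not part of the original script.
# It assumes that a results page past the last one comes back with an empty
# price list (the 'No Listings Found' case noted in the builder comments) and
# uses that as the stop signal. The helper name fetch_until_last_page and the
# chunk_size/max_pages parameters are hypothetical.
def fetch_until_last_page(builder, chunk_size=4, max_pages=100):
    prices = []
    page = 1
    while page <= max_pages:
        # request the next chunk of result pages in parallel
        pages = list(range(page, page + chunk_size))
        with concurrent.futures.ThreadPoolExecutor() as executor:
            results = list(executor.map(builder, pages))
        for result in results:
            if not result:
                # an empty price list marks the last page, so stop requesting
                return prices
            prices.extend(result)
        page += chunk_size
    return prices

# Example usage, replacing the fixed page_list in main (search_query must
# already be set, since the builders read it as a global):
#     all_sold_list = fetch_until_last_page(all_sold_list_builder)
#     new_sold_list = fetch_until_last_page(new_sold_list_builder)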