added all defs to main def

spbeach46 2020-06-11 18:26:32 -07:00
parent f40825139b
commit 869a4613fb

posh.py (148 changed lines)

@@ -7,84 +7,83 @@ import numpy as np
# import matplotlib.pyplot as plt
-def url_base_builder(search_query):
-    genders = ['Men', 'Women']
-    posh_colors = ['Red', 'Pink', 'Orange', 'Yellow', 'Green', 'Blue', 'Purple',
-                   'Gold', 'Silver', 'Black', 'Gray', 'White', 'Cream', 'Brown', 'Tan']
-    for i in range(0, len(posh_colors)):
-        if posh_colors[i] in search_query:
-            url_color = '&color[]=' + posh_colors[i]
-            color = posh_colors[i]
-            break
-        else:
-            color = ''
-            url_color = ''
-    for i in range(0, len(genders)):
-        if genders[i] in search_query:
-            url_gender = '&department=' + genders[i]
-            gender = genders[i]
-            break
-        else:
-            gender = ''
-            url_gender = '&department=All'
-    sq = search_query.replace(color, '').replace(gender, '').replace('NEW', '').replace(' ', '+')
-    all_sold_url_base = 'https://poshmark.com/search?query=' + sq + \
-        "&availability=sold_out" + url_color + url_gender + '&max_id='
-    new_sold_url_base = 'https://poshmark.com/search?query=' + sq + '&availability=sold_out' + \
-        '&condition=nwt_and_ret' + url_color + url_gender + '&max_id='
-    return all_sold_url_base, new_sold_url_base
-
-
-def all_sold_list_builder(i):
-    bases = url_base_builder(search_query)
-    all_sold_url_base = bases[0]
-    all_sold_prices = []
-    url = all_sold_url_base + str(i)
-    html = requests.get(url).text
-    soup = b(html, "lxml")
-    # last_page = soup.find(string=re.compile('No Listings Found'))
-    for price in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
-        price = price.get_text()
-        dollar_index = price.find('$')
-        price = price[dollar_index+1:]
-        space = price.find(' ')
-        price = int(price[:space-1])
-        all_sold_prices.append(price)
-    return all_sold_prices
-
-
-def new_sold_list_builder(i):
-    bases = url_base_builder(search_query)
-    new_sold_url_base = bases[1]
-    new_sold_prices = []
-    url = new_sold_url_base + str(i)
-    html = requests.get(url).text
-    soup = b(html, "lxml")
-    # last_page = soup.find(string=re.compile('No Listings Found'))  # present on every page that lacks a full 48 listings, so the conditional check ends up producing an empty price list
-    for price in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
-        price = price.get_text()
-        dollar_index = price.find('$')
-        price = price[dollar_index+1:]
-        space = price.find(' ')
-        price = int(price[:space-1])
-        new_sold_prices.append(price)
-    return new_sold_prices
def main():
+    def url_base_builder(search_query):
+        genders = ['Men', 'Women']
+        posh_colors = ['Red', 'Pink', 'Orange', 'Yellow', 'Green', 'Blue', 'Purple',
+                       'Gold', 'Silver', 'Black', 'Gray', 'White', 'Cream', 'Brown', 'Tan']
+        for i in range(0, len(posh_colors)):
+            if posh_colors[i] in search_query:
+                url_color = '&color[]=' + posh_colors[i]
+                color = posh_colors[i]
+                break
+            else:
+                color = ''
+                url_color = ''
+        for i in range(0, len(genders)):
+            if genders[i] in search_query:
+                url_gender = '&department=' + genders[i]
+                gender = genders[i]
+                break
+            else:
+                gender = ''
+                url_gender = '&department=All'
+        sq = search_query.replace(color, '').replace(gender, '').replace('NEW', '').replace(' ', '+')
+        all_sold_url_base = 'https://poshmark.com/search?query=' + sq + \
+            "&availability=sold_out" + url_color + url_gender + '&max_id='
+        new_sold_url_base = 'https://poshmark.com/search?query=' + sq + '&availability=sold_out' + \
+            '&condition=nwt_and_ret' + url_color + url_gender + '&max_id='
+        return all_sold_url_base, new_sold_url_base
+
+    def all_sold_list_builder(i):
+        bases = url_base_builder(search_query)
+        all_sold_url_base = bases[0]
+        all_sold_prices = []
+        url = all_sold_url_base + str(i)
+        html = requests.get(url).text
+        soup = b(html, "lxml")
+        # last_page = soup.find(string=re.compile('No Listings Found'))
+        for price in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
+            price = price.get_text()
+            dollar_index = price.find('$')
+            price = price[dollar_index+1:]
+            space = price.find(' ')
+            price = int(price[:space-1])
+            all_sold_prices.append(price)
+        return all_sold_prices
+
+    def new_sold_list_builder(i):
+        bases = url_base_builder(search_query)
+        new_sold_url_base = bases[1]
+        new_sold_prices = []
+        url = new_sold_url_base + str(i)
+        html = requests.get(url).text
+        soup = b(html, "lxml")
+        # last_page = soup.find(string=re.compile('No Listings Found'))  # present on every page that lacks a full 48 listings, so the conditional check ends up producing an empty price list
+        for price in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
+            price = price.get_text()
+            dollar_index = price.find('$')
+            price = price[dollar_index+1:]
+            space = price.find(' ')
+            price = int(price[:space-1])
+            new_sold_prices.append(price)
+        return new_sold_prices
    search_query = str(input('Title Search: '))
    start = time.time()
-    page_list = list(range(1, 2))
+    page_list = list(range(1, 5))
    all_sold_list = []
    new_sold_list = []
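
As a quick trace of what the nested url_base_builder above produces (not part of the diff; the sample query is made up, and the call assumes the function is reachable the way the old module-level version was), the resulting string follows directly from the concatenation in the code:

# Hypothetical call, for illustration only.
all_base, new_base = url_base_builder('NEW Women Black Adidas Hoodie')
# color -> 'Black', gender -> 'Women'; stripping 'NEW', the color and the gender
# leaves extra spaces that become '+' signs:
# all_base == ('https://poshmark.com/search?query=+++Adidas+Hoodie'
#              '&availability=sold_out&color[]=Black&department=Women&max_id=')
# Appending a page number (all_base + '1') gives the first sold-listings results page.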
@@ -116,5 +115,8 @@ def main():
    print('Average New Sold Price', average_new_sold_price, new_sold_results)
    print(total_results)
-main()
+if __name__ == '__main__':
+    main()
'''To speed up the program you could: 1) parse only the total results and sift out the NWT listings into a separate NWT list; 2) use a ProcessPoolExecutor so more than one worker parses pages; 3) find a better way to detect the last page so you don't make more requests than necessary, either by taking the "smallest" "No Listings Found" page and excluding the pages after it, or by determining from the request headers whether a page is worth downloading; 4) combine 3 with a while loop that works through pages in chunks of 2-4 to find the last page.'''
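
The note above is the roadmap for the next round of changes. Below is a minimal sketch of ideas 2-4, assuming only what the committed code already relies on (the 'No Listings Found' marker and the 'p--t--1 fw--bold' price span). It uses a ThreadPoolExecutor rather than the ProcessPoolExecutor named in the note, since the work is network-bound, and scrape_page/scrape_all are hypothetical names, not functions from posh.py:

import re
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup


def scrape_page(url_base, page_number):
    """Fetch one results page and return (prices, is_last_page)."""
    html = requests.get(url_base + str(page_number)).text
    soup = BeautifulSoup(html, 'lxml')
    # 'No Listings Found' shows up on any page without a full 48 listings,
    # so it doubles as an end-of-results marker (idea 3).
    is_last = soup.find(string=re.compile('No Listings Found')) is not None
    prices = []
    for tag in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
        match = re.search(r'\$(\d+)', tag.get_text())
        if match:
            prices.append(int(match.group(1)))
    return prices, is_last


def scrape_all(url_base, chunk_size=4):
    """Fetch pages in parallel chunks of 2-4 until a last page appears (ideas 2 and 4)."""
    all_prices, page = [], 1
    while True:
        pages = range(page, page + chunk_size)
        with ThreadPoolExecutor(max_workers=chunk_size) as pool:
            results = list(pool.map(lambda p: scrape_page(url_base, p), pages))
        for prices, is_last in results:
            all_prices.extend(prices)
            if is_last:
                return all_prices
        page += chunk_size

Pulling the dollar amount out with a regex is also a bit sturdier than the find('$')/find(' ') slicing used in the current list builders.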