added comments. changed html.parser to lxml

2020-06-11 17:21:40 -07:00
parent 6fd57ec6b9
commit 9d9d72e8d0
2 changed files with 27 additions and 18 deletions
--- a/1
+++ b/1
--- a/posh.py
+++ b/posh.py
@@ -4,14 +4,16 @@ import time
 import re
 import concurrent.futures
 import numpy as np
-import matplotlib.pyplot as plt
+# import matplotlib.pyplot as plt
+

 def url_base_builder(search_query):
    genders = ['Men', 'Women']

-    posh_colors = ['Red', 'Pink', 'Orange', 'Yellow', 'Green', 'Blue', 'Purple', 'Gold', 'Silver', 'Black', 'Gray', 'White', 'Cream', 'Brown', 'Tan']
+    posh_colors = ['Red', 'Pink', 'Orange', 'Yellow', 'Green', 'Blue', 'Purple',
+                   'Gold', 'Silver', 'Black', 'Gray', 'White', 'Cream', 'Brown', 'Tan']

-    for i in range(0,len(posh_colors)):
+    for i in range(0, len(posh_colors)):
        if posh_colors[i] in search_query:
            url_color = '&color[]=' + posh_colors[i]
            color = posh_colors[i]
@@ -19,7 +21,7 @@ def url_base_builder(search_query):
        else:
            color = ''
            url_color = ''
-    for i in range(0,len(genders)):
+    for i in range(0, len(genders)):
        if genders[i] in search_query:
            url_gender = '&department=' + genders[i]
            gender = genders[i]
@@ -28,23 +30,26 @@ def url_base_builder(search_query):
            gender = ''
            url_gender = '&department=All'

-    sq = search_query.replace(color,'').replace(gender,'').replace('NEW','').replace(' ', '+')
+    sq = search_query.replace(color, '').replace(gender, '').replace('NEW', '').replace(' ', '+')

-    all_sold_url_base = 'https://poshmark.com/search?query=' + sq + "&availability=sold_out" + url_color + url_gender + '&max_id='
+    all_sold_url_base = 'https://poshmark.com/search?query=' + sq + \
+        "&availability=sold_out" + url_color + url_gender + '&max_id='

-    new_sold_url_base = 'https://poshmark.com/search?query=' + sq + '&availability=sold_out' + '&condition=nwt_and_ret' + url_color + url_gender + '&max_id='
+    new_sold_url_base = 'https://poshmark.com/search?query=' + sq + '&availability=sold_out' + \
+        '&condition=nwt_and_ret' + url_color + url_gender + '&max_id='

    return all_sold_url_base, new_sold_url_base

+
 def all_sold_list_builder(i):
    bases = url_base_builder(search_query)
    all_sold_url_base = bases[0]
    all_sold_prices = []
    url = all_sold_url_base + str(i)
    html = requests.get(url).text
-    soup = b(html, "html.parser")
-    #last_page = soup.find(string = re.compile('No Listings Found'))
-    for price in soup.find_all('span', {'class':'p--t--1 fw--bold'}):
+    soup = b(html, "lxml")
+    # last_page = soup.find(string = re.compile('No Listings Found'))
+    for price in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
        price = price.get_text()
        dollar_index = price.find('$')
        price = price[dollar_index+1:]
@@ -53,16 +58,17 @@ def all_sold_list_builder(i):
        all_sold_prices.append(price)
    return all_sold_prices

+
 def new_sold_list_builder(i):
    bases = url_base_builder(search_query)
    new_sold_url_base = bases[1]
    new_sold_prices = []
    url = new_sold_url_base + str(i)
    html = requests.get(url).text
-    soup = b(html, "html.parser")
-    #last_page = soup.find(string = re.compile('No Listings Found'))#this is present in all pages that don't have a full 48 listings on them. So you end up with an empty price list becuase of your conditional statement
+    soup = b(html, "lxml")
+    # last_page = soup.find(string = re.compile('No Listings Found'))  # this is present in all pages that don't have a full 48 listings on them, so you end up with an empty price list because of your conditional statement

-    for price in soup.find_all('span', {'class':'p--t--1 fw--bold'}):
+    for price in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
        price = price.get_text()
        dollar_index = price.find('$')
        price = price[dollar_index+1:]
@@ -71,11 +77,12 @@ def new_sold_list_builder(i):
        new_sold_prices.append(price)
    return new_sold_prices

+
 search_query = str(input('Title Search: '))

 start = time.time()

-page_list = list(range(1, 13))
+page_list = list(range(1, 2))
 all_sold_list = []
 new_sold_list = []

@@ -85,7 +92,8 @@ with concurrent.futures.ThreadPoolExecutor() as executor:

 with concurrent.futures.ThreadPoolExecutor() as executor:
    for future in executor.map(new_sold_list_builder, page_list):
-        new_sold_list.extend(future)
+        new_sold_list.extend(future)# if you can pull the nwt price simultaneously with used then you won't have to use this
+

 for element in new_sold_list:
    all_sold_list.remove(element)
@@ -94,8 +102,8 @@ used_sold_list = all_sold_list
 average_used_sold_price = '$' + str(round(np.mean(used_sold_list), 2))
 average_new_sold_price = '$' + str(round(np.mean(new_sold_list), 2))

-used_sold_results = str(len(used_sold_list)) + ' Results'
-new_sold_results = str(len(new_sold_list)) + ' Results'
+used_sold_results = str(len(used_sold_list)) + ' Used Results'
+new_sold_results = str(len(new_sold_list)) + ' NWT Results'
 total_results = str(len(used_sold_list) + len(new_sold_list)) + ' Total Results'

 end = time.time()