added all defs to main def

spbeach46 2020-06-11 18:26:32 -07:00
parent f40825139b
commit 869a4613fb

posh.py (148 changed lines)

@@ -7,84 +7,83 @@ import numpy as np
# import matplotlib.pyplot as plt
-def url_base_builder(search_query):
-    genders = ['Men', 'Women']
-    posh_colors = ['Red', 'Pink', 'Orange', 'Yellow', 'Green', 'Blue', 'Purple',
-                   'Gold', 'Silver', 'Black', 'Gray', 'White', 'Cream', 'Brown', 'Tan']
-    for i in range(0, len(posh_colors)):
-        if posh_colors[i] in search_query:
-            url_color = '&color[]=' + posh_colors[i]
-            color = posh_colors[i]
-            break
-        else:
-            color = ''
-            url_color = ''
-    for i in range(0, len(genders)):
-        if genders[i] in search_query:
-            url_gender = '&department=' + genders[i]
-            gender = genders[i]
-            break
-        else:
-            gender = ''
-            url_gender = '&department=All'
-    sq = search_query.replace(color, '').replace(gender, '').replace('NEW', '').replace(' ', '+')
-    all_sold_url_base = 'https://poshmark.com/search?query=' + sq + \
-        "&availability=sold_out" + url_color + url_gender + '&max_id='
-    new_sold_url_base = 'https://poshmark.com/search?query=' + sq + '&availability=sold_out' + \
-        '&condition=nwt_and_ret' + url_color + url_gender + '&max_id='
-    return all_sold_url_base, new_sold_url_base
-
-
-def all_sold_list_builder(i):
-    bases = url_base_builder(search_query)
-    all_sold_url_base = bases[0]
-    all_sold_prices = []
-    url = all_sold_url_base + str(i)
-    html = requests.get(url).text
-    soup = b(html, "lxml")
-    # last_page = soup.find(string=re.compile('No Listings Found'))
-    for price in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
-        price = price.get_text()
-        dollar_index = price.find('$')
-        price = price[dollar_index+1:]
-        space = price.find(' ')
-        price = int(price[:space-1])
-        all_sold_prices.append(price)
-    return all_sold_prices
-
-
-def new_sold_list_builder(i):
-    bases = url_base_builder(search_query)
-    new_sold_url_base = bases[1]
-    new_sold_prices = []
-    url = new_sold_url_base + str(i)
-    html = requests.get(url).text
-    soup = b(html, "lxml")
-    # last_page = soup.find(string=re.compile('No Listings Found'))  # present on every page that lacks a full 48 listings, so the conditional check ends up producing an empty price list
-    for price in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
-        price = price.get_text()
-        dollar_index = price.find('$')
-        price = price[dollar_index+1:]
-        space = price.find(' ')
-        price = int(price[:space-1])
-        new_sold_prices.append(price)
-    return new_sold_prices
def main():
+    def url_base_builder(search_query):
+        genders = ['Men', 'Women']
+        posh_colors = ['Red', 'Pink', 'Orange', 'Yellow', 'Green', 'Blue', 'Purple',
+                       'Gold', 'Silver', 'Black', 'Gray', 'White', 'Cream', 'Brown', 'Tan']
+        for i in range(0, len(posh_colors)):
+            if posh_colors[i] in search_query:
+                url_color = '&color[]=' + posh_colors[i]
+                color = posh_colors[i]
+                break
+            else:
+                color = ''
+                url_color = ''
+        for i in range(0, len(genders)):
+            if genders[i] in search_query:
+                url_gender = '&department=' + genders[i]
+                gender = genders[i]
+                break
+            else:
+                gender = ''
+                url_gender = '&department=All'
+        sq = search_query.replace(color, '').replace(gender, '').replace('NEW', '').replace(' ', '+')
+        all_sold_url_base = 'https://poshmark.com/search?query=' + sq + \
+            "&availability=sold_out" + url_color + url_gender + '&max_id='
+        new_sold_url_base = 'https://poshmark.com/search?query=' + sq + '&availability=sold_out' + \
+            '&condition=nwt_and_ret' + url_color + url_gender + '&max_id='
+        return all_sold_url_base, new_sold_url_base
+
+    def all_sold_list_builder(i):
+        bases = url_base_builder(search_query)
+        all_sold_url_base = bases[0]
+        all_sold_prices = []
+        url = all_sold_url_base + str(i)
+        html = requests.get(url).text
+        soup = b(html, "lxml")
+        # last_page = soup.find(string=re.compile('No Listings Found'))
+        for price in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
+            price = price.get_text()
+            dollar_index = price.find('$')
+            price = price[dollar_index+1:]
+            space = price.find(' ')
+            price = int(price[:space-1])
+            all_sold_prices.append(price)
+        return all_sold_prices
+
+    def new_sold_list_builder(i):
+        bases = url_base_builder(search_query)
+        new_sold_url_base = bases[1]
+        new_sold_prices = []
+        url = new_sold_url_base + str(i)
+        html = requests.get(url).text
+        soup = b(html, "lxml")
+        # last_page = soup.find(string=re.compile('No Listings Found'))  # present on every page that lacks a full 48 listings, so the conditional check ends up producing an empty price list
+        for price in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
+            price = price.get_text()
+            dollar_index = price.find('$')
+            price = price[dollar_index+1:]
+            space = price.find(' ')
+            price = int(price[:space-1])
+            new_sold_prices.append(price)
+        return new_sold_prices
    search_query = str(input('Title Search: '))
    start = time.time()
-    page_list = list(range(1, 2))
+    page_list = list(range(1, 5))
    all_sold_list = []
    new_sold_list = []
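
As a quick trace of what the nested url_base_builder above produces (not part of the diff; the sample query is made up, and the call assumes the function is reachable the way the old module-level version was), the resulting string follows directly from the concatenation in the code:

# Hypothetical call, for illustration only.
all_base, new_base = url_base_builder('NEW Women Black Adidas Hoodie')
# color -> 'Black', gender -> 'Women'; stripping 'NEW', the color and the gender
# leaves extra spaces that become '+' signs:
# all_base == ('https://poshmark.com/search?query=+++Adidas+Hoodie'
#              '&availability=sold_out&color[]=Black&department=Women&max_id=')
# Appending a page number (all_base + '1') gives the first sold-listings results page.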
@@ -116,5 +115,8 @@ def main():
    print('Average New Sold Price', average_new_sold_price, new_sold_results)
    print(total_results)
-main()
+if __name__ == '__main__':
+    main()
'''To speed up the program you could: 1) parse only the total results and sift out the NWT listings into a separate NWT list; 2) use a ProcessPoolExecutor so more than one worker parses pages; 3) find a better way to detect the last page so you don't make more requests than necessary, either by taking the "smallest" "No Listings Found" page and excluding the pages after it, or by determining from the request headers whether a page is worth downloading; 4) combine 3 with a while loop that works through pages in chunks of 2-4 to find the last page.'''
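
The note above is the roadmap for the next round of changes. Below is a minimal sketch of ideas 2-4, assuming only what the committed code already relies on (the 'No Listings Found' marker and the 'p--t--1 fw--bold' price span). It uses a ThreadPoolExecutor rather than the ProcessPoolExecutor named in the note, since the work is network-bound, and scrape_page/scrape_all are hypothetical names, not functions from posh.py:

import re
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup


def scrape_page(url_base, page_number):
    """Fetch one results page and return (prices, is_last_page)."""
    html = requests.get(url_base + str(page_number)).text
    soup = BeautifulSoup(html, 'lxml')
    # 'No Listings Found' shows up on any page without a full 48 listings,
    # so it doubles as an end-of-results marker (idea 3).
    is_last = soup.find(string=re.compile('No Listings Found')) is not None
    prices = []
    for tag in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
        match = re.search(r'\$(\d+)', tag.get_text())
        if match:
            prices.append(int(match.group(1)))
    return prices, is_last


def scrape_all(url_base, chunk_size=4):
    """Fetch pages in parallel chunks of 2-4 until a last page appears (ideas 2 and 4)."""
    all_prices, page = [], 1
    while True:
        pages = range(page, page + chunk_size)
        with ThreadPoolExecutor(max_workers=chunk_size) as pool:
            results = list(pool.map(lambda p: scrape_page(url_base, p), pages))
        for prices, is_last in results:
            all_prices.extend(prices)
            if is_last:
                return all_prices
        page += chunk_size

Pulling the dollar amount out with a regex is also a bit sturdier than the find('$')/find(' ') slicing used in the current list builders.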