import requests
from bs4 import BeautifulSoup as b
import time
import re
import concurrent.futures
import numpy as np
# import matplotlib.pyplot as plt
def url_base_builder(search_query):
    genders = ['Men', 'Women']

    posh_colors = ['Red', 'Pink', 'Orange', 'Yellow', 'Green', 'Blue', 'Purple',
                   'Gold', 'Silver', 'Black', 'Gray', 'White', 'Cream', 'Brown', 'Tan']

    # if the search query names a color, add it to the URL as a filter
    for i in range(0, len(posh_colors)):
        if posh_colors[i] in search_query:
            url_color = '&color[]=' + posh_colors[i]
            color = posh_colors[i]
            break
    else:
        color = ''
        url_color = ''

    # likewise for a department (gender) filter
    for i in range(0, len(genders)):
        if genders[i] in search_query:
            url_gender = '&department=' + genders[i]
            gender = genders[i]
            break
    else:
        gender = ''
        url_gender = '&department=All'

    # strip the color, gender, and 'NEW' keywords out of the query text and
    # encode the remaining spaces for the URL
    sq = search_query.replace(color, '').replace(gender, '').replace('NEW', '').replace(' ', '+')

    all_sold_url_base = 'https://poshmark.com/search?query=' + sq + \
        '&availability=sold_out' + url_color + url_gender + '&max_id='

    new_sold_url_base = 'https://poshmark.com/search?query=' + sq + '&availability=sold_out' + \
        '&condition=nwt_and_ret' + url_color + url_gender + '&max_id='

    return all_sold_url_base, new_sold_url_base


def all_sold_list_builder(i):
    # collects the prices of all sold listings (used and new) on results page i
    bases = url_base_builder(search_query)
    all_sold_url_base = bases[0]
    all_sold_prices = []
    url = all_sold_url_base + str(i)
    html = requests.get(url).text
    soup = b(html, 'lxml')
    # last_page = soup.find(string=re.compile('No Listings Found'))
    for price in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
        price = price.get_text()
        # pull the integer dollar amount out of the listing text
        dollar_index = price.find('$')
        price = price[dollar_index + 1:]
        space = price.find(' ')
        price = int(price[:space - 1])
        all_sold_prices.append(price)
    return all_sold_prices


def new_sold_list_builder(i):
    # collects the prices of sold NWT (new with tags) listings on results page i
    bases = url_base_builder(search_query)
    new_sold_url_base = bases[1]
    new_sold_prices = []
    url = new_sold_url_base + str(i)
    html = requests.get(url).text
    soup = b(html, 'lxml')
    # last_page = soup.find(string=re.compile('No Listings Found'))
    # 'No Listings Found' is present on every page that doesn't have a full 48
    # listings, so such a page ends up with an empty price list because of the
    # conditional statement
    for price in soup.find_all('span', {'class': 'p--t--1 fw--bold'}):
        price = price.get_text()
        # pull the integer dollar amount out of the listing text
        dollar_index = price.find('$')
        price = price[dollar_index + 1:]
        space = price.find(' ')
        price = int(price[:space - 1])
        new_sold_prices.append(price)
    return new_sold_prices


def main():
    # the page-builder functions above read search_query as a global, so
    # declare it here before prompting for it
    global search_query
    search_query = str(input('Title Search: '))

    start = time.time()

    page_list = list(range(1, 2))  # only the first results page for now
    all_sold_list = []
    new_sold_list = []

    with concurrent.futures.ThreadPoolExecutor() as executor:
        for future in executor.map(all_sold_list_builder, page_list):
            all_sold_list.extend(future)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        for future in executor.map(new_sold_list_builder, page_list):
            new_sold_list.extend(future)  # if the NWT prices could be pulled in the same pass as the used ones, this second pass wouldn't be needed

    # all_sold_list holds both used and NWT prices; removing the NWT prices
    # leaves only the used ones
    for element in new_sold_list:
        all_sold_list.remove(element)
    used_sold_list = all_sold_list

    average_used_sold_price = '$' + str(round(np.mean(used_sold_list), 2))
    average_new_sold_price = '$' + str(round(np.mean(new_sold_list), 2))

    used_sold_results = str(len(used_sold_list)) + ' Used Results'
    new_sold_results = str(len(new_sold_list)) + ' NWT Results'
    total_results = str(len(used_sold_list) + len(new_sold_list)) + ' Total Results'

    end = time.time()

    print(end - start, 'seconds')

    print('Average Used Sold Price', average_used_sold_price, used_sold_results)
    print('Average New Sold Price', average_new_sold_price, new_sold_results)
    print(total_results)


if __name__ == '__main__':
    main()


'''
To speed up the program a few things could be included:
1) Only parse the total results and sift out the NWT listings to create a separate NWT list.
2) Implement ProcessPoolExecutor to use more than one worker to parse the pages.
3) Find a better way to find the last page so you don't have to make more requests than
   necessary. This could be either taking the "smallest" "No Listings Found" page while
   excluding the others after the smallest one is found, or determining from the request
   headers whether a page is worth downloading or not.
4) Use a while loop in chunks of 2-4 pages to find the last page, in conjunction with number 3.
'''
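
# A minimal sketch of ideas 3) and 4) above, not part of the original script.
# It assumes that a results page past the last one comes back with an empty
# price list (the 'No Listings Found' case noted in the builder comments) and
# uses that as the stop signal. The helper name fetch_until_last_page and the
# chunk_size/max_pages parameters are hypothetical.
def fetch_until_last_page(builder, chunk_size=4, max_pages=100):
    prices = []
    page = 1
    while page <= max_pages:
        # request the next chunk of result pages in parallel
        pages = list(range(page, page + chunk_size))
        with concurrent.futures.ThreadPoolExecutor() as executor:
            results = list(executor.map(builder, pages))
        for result in results:
            if not result:
                # an empty price list marks the last page, so stop requesting
                return prices
            prices.extend(result)
        page += chunk_size
    return prices

# Example usage, replacing the fixed page_list in main (search_query must
# already be set, since the builders read it as a global):
#     all_sold_list = fetch_until_last_page(all_sold_list_builder)
#     new_sold_list = fetch_until_last_page(new_sold_list_builder)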