deleted curate defs that work from json directly instead of pandas df
This commit is contained in:
parent
92516ff4f0
commit
d09090dc13
86
ebay_api.py
86
ebay_api.py
@ -14,11 +14,11 @@ class FindingApi:
|
||||
'findItemsByProduct'
|
||||
][service]
|
||||
self.pageNumber = list(range(1, pageNumber)) # 64 pages is recommended
|
||||
# this will give equal weights to cats given call constraints
|
||||
# as this will give equal weights to cats given call constraints
|
||||
|
||||
# departments = ["3034","93427"] (womens and mens)
|
||||
|
||||
def get_data(self, category_id, i): # TODO you're going to have to use nested functions of lambda functions here somewhere
|
||||
def get_data(self, category_id, i):
|
||||
|
||||
'''
|
||||
Gets raw JSON data from FindingApi service call
|
||||
@ -77,7 +77,7 @@ class FindingApi:
|
||||
len(itemid_results_list), 20))]
|
||||
return item_id_results
|
||||
|
||||
## TODO instead of running through multiple try except loops try to implement set methods for efficiency and ease. Remember symmetric_difference, difference, intersection, set()
|
||||
# TODO instead of running through multiple try except loops try to implement set methods for efficiency and ease. Remember symmetric_difference, difference, intersection, set()
|
||||
# for category_id in cat_list:
|
||||
|
||||
class ShoppingApi:
|
||||
@ -126,35 +126,6 @@ class CurateData:
|
||||
Takes item in data from ShoppingApi request as argument and extracts/ creates key
|
||||
value pairs that get updated to custom dataframe used in ML training sets.
|
||||
'''
|
||||
def extract_itemId(self, item):
|
||||
item_id = {'ItemID':item['ItemID']}
|
||||
return item_id
|
||||
|
||||
def extract_catId(self, item):
|
||||
catId = {'PrimaryCategoryID':item['PrimaryCategoryID']}
|
||||
return catId
|
||||
|
||||
def extract_prime_cat_name(self, item):
|
||||
prime_cat_name= {'PrimaryCategoryName':item['PrimaryCategoryName']}
|
||||
return prime_cat_name
|
||||
|
||||
def extract_picture_url(self, item):
|
||||
'''
|
||||
Only pulls PictureURL list and does not
|
||||
create dictionary
|
||||
'''
|
||||
picture_url_list = item['PictureURL']
|
||||
return picture_url_list
|
||||
|
||||
def extract_nvl(self, item):
|
||||
names = []
|
||||
values = []
|
||||
nvl = item['ItemSpecifics']['NameValueList']
|
||||
for nvl_dict in nvl:
|
||||
names.append(nvl_dict['Name'])
|
||||
values.append(nvl_dict['Value'])
|
||||
nvl_dict = dict(zip(names, values))
|
||||
return nvl_dict
|
||||
|
||||
def update_df(self, data):
|
||||
'''
|
||||
@ -164,33 +135,20 @@ class CurateData:
|
||||
per listing, each picture will be its own training instance.
|
||||
'''
|
||||
|
||||
training = {}
|
||||
for item in data:
|
||||
# TODO MAY HAVE TO DISCARD THIS IDEA DUE TO CRAPPY PICTURES OF CLOSEUPS AND TAGS. may have to settle for first picture which is likely to contain more accurate representation of item.
|
||||
picture_url_list = self.extract_picture_url(item)
|
||||
'''
|
||||
Creates same training instance per photo for
|
||||
'''
|
||||
# for url in picture_url_list: # maybe try removing for loop to see if csv updates correctly here
|
||||
# remote_url = {'PictureURL':url}
|
||||
# training.update(remote_url)
|
||||
item_id = self.extract_itemId(item)
|
||||
training.update(item_id)
|
||||
catId = self.extract_catId(item)
|
||||
training.update(catId)
|
||||
prime_cat_name = self.extract_prime_cat_name(item)
|
||||
training.update(prime_cat_name)
|
||||
nvl_dict = self.extract_nvl(item)
|
||||
training.update(nvl_dict)
|
||||
|
||||
df = pd.json_normalize(training) # TODO FIX INDENT HERE?
|
||||
#df.to_csv('training.csv', mode='a')
|
||||
print(training) # after looking at the training output it looks like everything might be out of order due possibly to multithreading issues. Due to this you may have to use a more fine-grained
|
||||
# multithreading module
|
||||
def data_frame(self, data):
|
||||
to_json = json.dumps(data)
|
||||
raw_df = pd.read_json(to_json)
|
||||
return raw_df
|
||||
|
||||
def to_training(self):
|
||||
raw_df = self.data_frame(data)
|
||||
interm_df1 = raw_df.loc[:, ['ItemID', 'PictureURL', 'PrimaryCategoryID', 'PrimaryCategoryName', 'Title', 'ItemSpecifics']]
|
||||
interm_df1[['ItemID', 'PrimaryCAegoryID']] = interm_df[['ItemID', 'PrimaryCategoryID']].astype(str)
|
||||
# USE raw_df.loc[:, ['col1', col2', 'col3', 'etc']] for creating new df. There may be another way though.
|
||||
|
||||
# USE pd.concat([1st df, 2nd df], sort=False) to combine dfs and later into larger csv files. You can transform each new raw_df first before combining it with the previous transformed
|
||||
# df. then you can take the raw_df and combine it with the old raw_df for backup.
|
||||
|
||||
# TODO You will have to mess around more with pandas df to find a better solution to creating your csv file: i.e., create dataframe from instances, run through process to customize your df
|
||||
# for final training set for your ml model training. Contemplate on the future... you want ability to update main csv AND training csv; one for updating raw data instances from search queries, and
|
||||
# the other for updating your training set.
|
||||
@ -201,17 +159,8 @@ def main():
|
||||
Main program creates/updates a csv file to use for ML training from live
|
||||
ebay listings
|
||||
'''
|
||||
# service, pageNumber = input('service and pageNumber:').split()
|
||||
# service = int(service)
|
||||
# pageNumber = int(pageNumber)
|
||||
# finding = FindingApi(service, pageNumber)
|
||||
|
||||
# item_id_results = finding.get_ids_from_cats()
|
||||
shopping = ShoppingApi()
|
||||
data = shopping.conky()
|
||||
curate = CurateData()
|
||||
curate.update_df(data)
|
||||
return data
|
||||
pass
|
||||
# main goes here:
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -221,11 +170,6 @@ if __name__ == "__main__":
|
||||
# to divide these up into the categories. This will leave you with about 6.25K results per cat.
|
||||
# More than enough data for your dataset.
|
||||
|
||||
# Need to make sure dataframe gets important stuff outside of nvl in order to
|
||||
# access values for cross referencing itemIds from calls
|
||||
# Need to decide if list gets accessed from df or if you're just going to have
|
||||
# list contents extracted and possibly placed into separate cells/labels
|
||||
|
||||
# TODO NEED TO ADD TRY EXCEPT CONDITIONS FOR EVERY CALL MADE TO API SERVICES TO
|
||||
# AVOID HICCUPS WHEN CREATING DATASET
|
||||
# TODO YOU WILL HAVE TO FIND A WAY OF COLLECTING DATA FOR IMAGES OF TAGS EITHER USING YOUR OWN TAGS OR SOMEHOW FIND A WAY TO FIND TAGS ON OTHERS' LISTINGS. CRUCIAL FOR THE LISTINGS PROCESS. May be as simple as adding a def to one of the apis to extract only the picture if it can identify what a tag looks like. So, it may actually be a good thing to include all the pictures in a training set but then when you're ready to begin training you'll have a data cleaning pipeline specific to training a model to either learn shoe features or information on tags.
|
||||
|
Loading…
Reference in New Issue
Block a user