From ef237b8a1b3b3444c093ee3fbbd3d92c179d430a Mon Sep 17 00:00:00 2001
From: spbeach46
Date: Wed, 7 Apr 2021 15:50:23 -0700
Subject: [PATCH] added dict_pics.txt updating for expanded_dfs method

---
 curate.py   |  3 ++-
 ebay_api.py | 19 ++++++++++++-------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/curate.py b/curate.py
index f6db7cd..7972644 100644
--- a/curate.py
+++ b/curate.py
@@ -1,5 +1,4 @@
 import ebay_api
-import numpy as np
 
 '''
 file used to compile methods from ebay_api.py for curating training data
@@ -13,8 +12,10 @@
 nvl_training = curate.nvl_training(training)
 dropd = curate.drop_nvl_cols(nvl_training)
 expanded_dfs = curate.expand_nvlclass(class_training, dropd)
+
 expanded_class = expanded_dfs[0]
 expanded_dropd = expanded_dfs[1]
 dict_pics = expanded_dfs[2]
+
 # TODO # need to replace expanded df's PictureURL col values with destination urls
 # TODO # still have the problem of duplicate listings. Possibly take care of this before you run curate
diff --git a/ebay_api.py b/ebay_api.py
index f1077fe..3f40a17 100644
--- a/ebay_api.py
+++ b/ebay_api.py
@@ -267,28 +267,33 @@ class CurateData:
         instances. Modifies both class training and dropd dfs. Appends custom image url dict {'source':'destination'}.
         '''
-        expanded_class = class_training.explode('PictureURL').reset_index(drop=True) # TODO DROp rows with nan values in PicturlURL cell
+        expanded_class = class_training.explode('PictureURL').reset_index(drop=True) # TODO drop duplicates here or before instantiating curate object
         expanded_class = expanded_class.dropna(subset=['PictureURL'])
-        expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True) # TODO Drop rows with nan values in PictureURL cells either here or before somewhere
+        expanded_class = expanded_class.drop_duplicates(subset=['PictureURL'])
+        expanded_class.loc[:,'PictureURL'] = expanded_class.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x])
+        expanded_dropd = dropd.explode('PictureURL').reset_index(drop=True) # TODO Drop duplicates here or before instantiating curate object
         expanded_dropd = expanded_dropd.dropna(subset=['PictureURL'])
+        expanded_dropd = expanded_dropd.drop_duplicates(subset=['PictureURL'])
+        expanded_dropd.loc[:,'PictureURL'] = expanded_dropd.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x])
+
         expanded_dropd = self.extract_df(expanded_dropd) # convert lists to values
 
         dict_pics_list = list(set(expanded_class.PictureURL.to_list())) # prolly need to create set long before df... immediately after Shopping or trading call
         destination = 'your target folder' # decide whether or not you want to set a default folder to have the user define it as input every time. or have this only
         # defined in the download function
-        dict_pics = {k:destination+re.search(r'[^/]+(?=/\$_|.jpg)', k).group()+'.jpg' for k in dict_pics_list} # TODO determine how to implement destination variable
-        # TODO still getting exceptions such as 'https://i.ebayimg.com/images/g/RG8AAOSwqMtd1esL/s-l1600.jpg'. add conditions to dict comprehension.
+        dict_pics = {k:destination+re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group()+'.jpg' for k in dict_pics_list}
+        expanded_class = expanded_class
+
         # with open('dict_pics.txt','w+') as f: # TODO open if it exists, or write if not, then extend the dictionary with dict_pics
-        # re.search(r'\w+(?=/\$_)', a).group()
-        return expanded_class, expanded_dropd # TODO still need to replace source url to destination url in df cols and create custom dict {, }
+        return expanded_class, expanded_dropd, dict_pics # TODO still need to replace source url to destination url in df cols and create custom dict {, }
 
     def dl_pictures(self):
         '''
         Downloads pictures from api to local storage using custom master dict
         '''
-        with open('dict_pic.txt', 'w+') as jf: # TODO requires cleaning up
+        with open('dict_pic.txt', 'w+') as jf: # avoid duplicate logic goes here... I think
             dict_pics = json.load(jf)
             r = requests.get('', stream=True)
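
A quick sketch of what the new expand_nvlclass steps do to a toy frame, in case it helps review: explode to one row per picture, drop listings with no pictures, drop repeated picture URLs, then swap each source URL for its destination path. The toy dict_pics and URLs below are made up for illustration; the patch assumes dict_pics already covers every URL left in the column.

import pandas as pd

# hypothetical source->destination mapping and listings frame, for illustration only
dict_pics = {'https://i.ebayimg.com/a.jpg': 'images/a.jpg',
             'https://i.ebayimg.com/b.jpg': 'images/b.jpg'}
class_training = pd.DataFrame({'PictureURL': [['https://i.ebayimg.com/a.jpg',
                                               'https://i.ebayimg.com/b.jpg'],
                                              None]})

expanded_class = class_training.explode('PictureURL').reset_index(drop=True)  # one row per picture
expanded_class = expanded_class.dropna(subset=['PictureURL'])                 # drop rows with no picture
expanded_class = expanded_class.drop_duplicates(subset=['PictureURL'])        # drop repeated picture urls
expanded_class.loc[:, 'PictureURL'] = expanded_class.loc[:, 'PictureURL'].apply(lambda x: dict_pics[x])
print(expanded_class.PictureURL.to_list())  # ['images/a.jpg', 'images/b.jpg']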
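
On the updated dict_pics comprehension, a minimal sketch of how one source URL maps to a destination path, assuming destination = 'images/' (the patch still leaves it as the 'your target folder' placeholder). The re.IGNORECASE flag presumably covers URLs whose extension is upper-case (.JPG), where the old case-sensitive lookahead returned None and .group() raised AttributeError.

import re

destination = 'images/'  # hypothetical target folder
dict_pics_list = ['https://i.ebayimg.com/images/g/RG8AAOSwqMtd1esL/s-l1600.jpg']  # example URL from the old TODO comment
dict_pics = {k: destination + re.search(r'[^/]+(?=/\$_|.jpg)', k, re.IGNORECASE).group() + '.jpg'
             for k in dict_pics_list}
print(dict_pics)  # {'https://i.ebayimg.com/images/g/RG8AAOSwqMtd1esL/s-l1600.jpg': 'images/s-l1600.jpg'}

Worth noting that for this URL style the lookahead captures 's-l1600' rather than the unique image id (RG8AAOSwqMtd1esL), so distinct images of this form still collapse onto the same destination file; that may be related to the duplicate-listings TODO in curate.py.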
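
On dl_pictures: opening 'dict_pic.txt' with mode 'w+' truncates the file before json.load runs, so the load can never return the saved dict (note also the filename differs from the 'dict_pics.txt' mentioned in expand_nvlclass). Below is a rough sketch of the "open if it exists, or write if not, then extend the dictionary" idea from that TODO plus a streamed download; passing dict_pics in as an argument, the dict_pics.txt filename, and the chunked requests loop are assumptions rather than the project's settled design.

import json
import os
import requests

def dl_pictures(self, dict_pics, path='dict_pics.txt'):
    # Load the saved source->destination dict if the file exists, otherwise start empty,
    # then extend it with the new entries and write it back out.
    if os.path.exists(path):
        with open(path) as jf:
            master = json.load(jf)
    else:
        master = {}
    master.update(dict_pics)
    with open(path, 'w') as jf:
        json.dump(master, jf)

    # Stream each source image to its destination file.
    for source, target in dict_pics.items():
        r = requests.get(source, stream=True)
        r.raise_for_status()
        with open(target, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)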