Fix for dtype output: the length of dtypes must match the number of dataframe columns. This check is now also included in the unit test.

erdogant · Jul 14, 2020 · efff4bc
1 parent ff9ed3e
commit efff4bc
Show file tree

Hide file tree

Showing 3 changed files with 8 additions and 38 deletions.
diff --git a/df2onehot/__init__.py b/df2onehot/__init__.py
@@ -11,7 +11,7 @@
 
 __author__ = 'Erdogan Tasksen'
 __email__ = '[email protected]'
-__version__ = '0.1.9'
+__version__ = '0.1.10'
 
 # module level doc-string
 __doc__ = """

diff --git a/df2onehot/df2onehot.py b/df2onehot/df2onehot.py
@@ -259,7 +259,7 @@ def _deep_extract(df, dtypes, perc_min_num=None, verbose=3):
             try:
                 dfc, idxempty = dict2df(df.iloc[:, idx])
                 # dfc = pd.DataFrame.from_records(df.iloc[:,idx])
-                if verbose>=3: print('[df2onehot] >[%s]%s >deep extract > [%s] [%d]' %(df.columns[idx], makespaces, dtypes[idx], dfc.shape[1]))
+                if verbose>=3: print('[df2onehot] >[%s]%s >deep extract > [%s]  [%d]' %(df.columns[idx], makespaces, dtypes[idx], dfc.shape[1]))
             except:
                 if verbose>=3: print('[df2onehot] >[%s]%s >deep extract > [failed]' %(df.columns[idx], makespaces))
                 # dfc = df.iloc[:,idx].astype(str)
@@ -279,8 +279,6 @@ def _deep_extract(df, dtypes, perc_min_num=None, verbose=3):
         # Expand every columns that contains either list
         for idx in idxCol:
             makespaces = ''.join([' '] * (max_str_len - len(df.columns[idx])))
-            # if verbose>=3: print('[df2onehot] >%s column is detected: [%s]' %(dtypes[idx], df.columns[idx]))
-            if verbose>=3: print('[df2onehot] >[%s]%s >deep extract >[%s]' %(df.columns[idx], makespaces, dtypes[idx]))
             # Convert str/float/int to type
             df, uifeat = _col2type(df, dtypes, idx)
             # Convert column into onehot
@@ -289,13 +287,14 @@ def _deep_extract(df, dtypes, perc_min_num=None, verbose=3):
             dftot2 = _concat(dftot2, dfc)
             # Add idx to remove
             idxrem2.append(idx)
+            if verbose>=3: print('[df2onehot] >[%s]%s >deep extract > [%s]  [%d]' %(df.columns[idx], makespaces, dtypes[idx], dfc.shape[1]))
 
     # Drop columns that are expanded
     idxrem = idxrem1+idxrem2
     if len(idxrem)>0:
         # Remove the extracted column names from list and dict
-        df.drop(labels = df.columns[idxrem].values, axis=1, inplace=True)
         idxkeep = np.setdiff1d(np.arange(0, df.shape[1]), idxrem)
+        df.drop(labels = df.columns[idxrem].values, axis=1, inplace=True)
         dtypes = np.array(dtypes)
         dtypes = list(dtypes[idxkeep])
         # Combine the extracted list and dict data
@@ -307,9 +306,10 @@ def _deep_extract(df, dtypes, perc_min_num=None, verbose=3):
         # Combine into dataframe
         df = pd.concat([df, dftot], axis=1)
         dtypes = dtypes + dtypest
+        if verbose>=3: print('[df2onehot] >[%d] additional columns extracted by deep extract.' %(dftot1.shape[1]+dftot2.shape[1]))
 
     # Return
-    if verbose>=3: print('[df2onehot] >[%d] additional columns extracted by deep extract.' %(dftot1.shape[1]+dftot2.shape[1]))
+    if df.shape[1]!=len(dtypes): raise Exception('[df2onehot] >Error: size of dtypes and dataframe does not match.')
     return(df, dtypes)
 
 
@@ -327,38 +327,6 @@ def _findcol(x, cols):
     # SLICE COPY WARNING!
     return(np.isin(cols, x))
 
-
-# %% Example data
-# def import_example(getfile='titanic'):
-#     """Import example.
-
-#     Description
-#     -----------
-
-#     Parameters
-#     ----------
-#     getfile : String, optional
-#         'titanic'
-
-#     Returns
-#     -------
-#     df : DataFrame
-
-#     """
-
-#     if getfile=='titanic':
-#         getfile='titanic_train.zip'
-
-#     print('[df2onehot] >Loading %s..' %getfile)
-#     curpath = os.path.dirname(os.path.abspath(__file__))
-#     PATH_TO_DATA=os.path.join(curpath, 'data', getfile)
-#     if os.path.isfile(PATH_TO_DATA):
-#         df=pd.read_csv(PATH_TO_DATA, sep=',')
-#         return df
-#     else:
-#         print('[df2onehot] >Oops! Example data not found!')
-#         return None
-
 # %% Import example dataset from github.
 def import_example(data='titanic', url=None, sep=',', verbose=3):
     """Import example dataset from github source.

diff --git a/tests/test_df2onehot.py b/tests/test_df2onehot.py
@@ -15,6 +15,8 @@ def test_df2onehot():
     assert np.all(ycounts==np.array([148,   4, 891,   7, 891,   3,   2,   7,   2, 681, 1]))
     # TEST WHETHER SIMILAR VALUES ARE SET TO TRUE
     assert out['onehot']['all_true'].sum()==df.shape[0]
+    # TEST WHETHER SIZE MATCHES
+    assert out['numeric'].shape[1]==len(out['dtypes'])
 
     df = import_example()