Skip to content

Commit

Permalink
Fix for dtype output. This check is now also covered by the unit tests.
Browse files Browse the repository at this point in the history
  • Loading branch information
erdogant committed Jul 14, 2020
1 parent ff9ed3e commit efff4bc
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 38 deletions.
2 changes: 1 addition & 1 deletion df2onehot/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

__author__ = 'Erdogan Tasksen'
__email__ = '[email protected]'
__version__ = '0.1.9'
__version__ = '0.1.10'

# module level doc-string
__doc__ = """
Expand Down
42 changes: 5 additions & 37 deletions df2onehot/df2onehot.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ def _deep_extract(df, dtypes, perc_min_num=None, verbose=3):
try:
dfc, idxempty = dict2df(df.iloc[:, idx])
# dfc = pd.DataFrame.from_records(df.iloc[:,idx])
if verbose>=3: print('[df2onehot] >[%s]%s >deep extract > [%s] [%d]' %(df.columns[idx], makespaces, dtypes[idx], dfc.shape[1]))
if verbose>=3: print('[df2onehot] >[%s]%s >deep extract > [%s] [%d]' %(df.columns[idx], makespaces, dtypes[idx], dfc.shape[1]))
except:
if verbose>=3: print('[df2onehot] >[%s]%s >deep extract > [failed]' %(df.columns[idx], makespaces))
# dfc = df.iloc[:,idx].astype(str)
Expand All @@ -279,8 +279,6 @@ def _deep_extract(df, dtypes, perc_min_num=None, verbose=3):
# Expand every column that contains a list
for idx in idxCol:
makespaces = ''.join([' '] * (max_str_len - len(df.columns[idx])))
# if verbose>=3: print('[df2onehot] >%s column is detected: [%s]' %(dtypes[idx], df.columns[idx]))
if verbose>=3: print('[df2onehot] >[%s]%s >deep extract >[%s]' %(df.columns[idx], makespaces, dtypes[idx]))
# Convert str/float/int to type
df, uifeat = _col2type(df, dtypes, idx)
# Convert column into onehot
Expand All @@ -289,13 +287,14 @@ def _deep_extract(df, dtypes, perc_min_num=None, verbose=3):
dftot2 = _concat(dftot2, dfc)
# Add idx to remove
idxrem2.append(idx)
if verbose>=3: print('[df2onehot] >[%s]%s >deep extract > [%s] [%d]' %(df.columns[idx], makespaces, dtypes[idx], dfc.shape[1]))

# Drop columns that are expanded
idxrem = idxrem1+idxrem2
if len(idxrem)>0:
# Remove the extracted column names from list and dict
df.drop(labels = df.columns[idxrem].values, axis=1, inplace=True)
idxkeep = np.setdiff1d(np.arange(0, df.shape[1]), idxrem)
df.drop(labels = df.columns[idxrem].values, axis=1, inplace=True)
dtypes = np.array(dtypes)
dtypes = list(dtypes[idxkeep])
# Combine the extracted list and dict data
Expand All @@ -307,9 +306,10 @@ def _deep_extract(df, dtypes, perc_min_num=None, verbose=3):
# Combine into dataframe
df = pd.concat([df, dftot], axis=1)
dtypes = dtypes + dtypest
if verbose>=3: print('[df2onehot] >[%d] additional columns extracted by deep extract.' %(dftot1.shape[1]+dftot2.shape[1]))

# Return
if verbose>=3: print('[df2onehot] >[%d] additional columns extracted by deep extract.' %(dftot1.shape[1]+dftot2.shape[1]))
if df.shape[1]!=len(dtypes): raise Exception('[df2onehot] >Error: size of dtypes and dataframe does not match.')
return(df, dtypes)


Expand All @@ -327,38 +327,6 @@ def _findcol(x, cols):
# SLICE COPY WARNING!
return(np.isin(cols, x))


# %% Example data
# def import_example(getfile='titanic'):
# """Import example.

# Description
# -----------

# Parameters
# ----------
# getfile : String, optional
# 'titanic'

# Returns
# -------
# df : DataFrame

# """

# if getfile=='titanic':
# getfile='titanic_train.zip'

# print('[df2onehot] >Loading %s..' %getfile)
# curpath = os.path.dirname(os.path.abspath(__file__))
# PATH_TO_DATA=os.path.join(curpath, 'data', getfile)
# if os.path.isfile(PATH_TO_DATA):
# df=pd.read_csv(PATH_TO_DATA, sep=',')
# return df
# else:
# print('[df2onehot] >Oops! Example data not found!')
# return None

# %% Import example dataset from github.
def import_example(data='titanic', url=None, sep=',', verbose=3):
"""Import example dataset from github source.
Expand Down
2 changes: 2 additions & 0 deletions tests/test_df2onehot.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ def test_df2onehot():
assert np.all(ycounts==np.array([148, 4, 891, 7, 891, 3, 2, 7, 2, 681, 1]))
# TEST WHETHER SIMILAR VALUES ARE SET TO TRUE
assert out['onehot']['all_true'].sum()==df.shape[0]
# TEST WHETHER SIZE MATCHES
assert out['numeric'].shape[1]==len(out['dtypes'])

df = import_example()

Expand Down

0 comments on commit efff4bc

Please sign in to comment.