From 314de17d90525e003eb9ba1e5434f6995da3a5dc Mon Sep 17 00:00:00 2001 From: papanikos <n.pappas@uu.nl> Date: Thu, 14 Jan 2021 08:40:32 +0100 Subject: [PATCH] fix for pandas warnings --- workflow/scripts/get_lca.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/workflow/scripts/get_lca.py b/workflow/scripts/get_lca.py index 3790ebc..4d6a9a8 100644 --- a/workflow/scripts/get_lca.py +++ b/workflow/scripts/get_lca.py @@ -142,14 +142,26 @@ if __name__ == '__main__': index_col='contig' ) prediction_cols = [c for c in data.columns if c.endswith('_pred')] + + # Slice and make a copy to suppress SettingWithCopyWarning + hosts = data[prediction_cols].copy() + # Set the data type to string so the sanitizing will work + for col in prediction_cols: + hosts[col] = hosts[col].astype('string') + # Sanitize columns to be able to query the taxonomy - hosts = data[prediction_cols] if 'vhulk_pred' in prediction_cols: - hosts['vhulk_pred'] = hosts['vhulk_pred'].str.replace('_', ' ') + hosts['vhulk_pred'] = hosts['vhulk_pred'].str.replace('_', ' ') if 'wish_pred' in prediction_cols: - hosts['wish_pred'] = [' '.join(i.split(';')[-2].split()[:2]) - for i in data['wish_pred'].values] + # Original is superkingdom;kingdom;...;species;name + # This selects species + hosts['wish_pred'] = hosts['wish_pred'].str.split(';').str.get(-2) + # This splits the species and gets the first 2 elements + # sometimes species contains strain info too + hosts['wish_pred'] = hosts['wish_pred'].str.split().str[:2] + # Rejoin the species in a single string `genus species` + hosts['wish_pred'] = hosts['wish_pred'].str.join(' ') hosts = hosts.apply(translate_row) -- GitLab