analysis.py.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ML\n",
    "from sklearn.model_selection import train_test_split, RandomizedSearchCV\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn import metrics\n",
    "\n",
    "from scipy.cluster.hierarchy import dendrogram, linkage\n",
    "from scipy.spatial.distance import squareform, pdist\n",
    "\n",
    "# Data handling\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from collections import Counter\n",
    "import operator\n",
    "\n",
    "# Plotting\n",
    "%matplotlib inline\n",
    "# import matplotlib.patches as mpatches\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "# Read and write\n",
    "from pathlib import Path # filesystem\n",
    "\n",
    "## sklearn is giving me\n",
    "## sklearn/model_selection/_search.py:814: DeprecationWarning: \n",
    "## The default of the `iid` parameter will change from True to False in version 0.22 \n",
    "## and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.\n",
    "##  DeprecationWarning)\n",
    "\n",
    "# Suppress that\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore', category=DeprecationWarning)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initial setup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "outputs": [],
   "source": [
    "## GLOBAL VARIABLES\n",
    "\n",
    "These are variables that are used throughout the notebook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set this to False if you don't want to scale the feature values\n",
    "SCALE_DATA = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Only change these if you know what you are doing\n",
    "\n",
    "# A dictionary that maps original feature names to final ones\n",
    "FEATURES_MAP = dict(zip(['jaccard_score', 'same_score', \n",
    "                            'inwards_score', 'outwards_score', \n",
    "                            'avg_distance', 'mean_ani', \n",
    "                            'mean_aai'],\n",
    "                        ['Co-occurrence', 'Co-orientation',\n",
    "                            'Convergent', 'Divergent',\n",
    "                            'Average Distance', 'Mean ANI',\n",
    "                            'Mean AAI']\n",
    "                       )\n",
    "                   )\n",
    "\n",
    "# A list to only get the features\n",
    "# Useful for slicing data frames\n",
    "FEATURES = ['jaccard_score', 'same_score', \n",
    "            'inwards_score', 'outwards_score', \n",
    "            'avg_distance', 'mean_ani', \n",
    "            'mean_aai']\n",
    "                    \n",
    "# Features with label appended\n",
    "# useful for plotting based on label\n",
    "FEAT_LAB = FEATURES.copy()\n",
    "FEAT_LAB.append('label')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "outputs": [],
   "source": [
    "## HELPER FUNCTIONS"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "outputs": [],
   "source": [
    "* Reading data\n",
    "\n",
    "Some of these functions were written **before** filtering for max distance, so they try to address this issue."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_scores_table(scores_fp, label=None):\n",
    "    \"\"\"\n",
    "    Read a table from a tsv file provided as a path.\n",
    "    Return a dataframe.\n",
    "    \n",
    "    If label is set, append a column named `label` with\n",
    "    the provided value\n",
    "    \"\"\"\n",
    "    scores_df = pd.read_csv(scores_fp, sep='\\t')\n",
    "    scores_df['interaction'] = scores_df['pvog1'] + '-' + scores_df['pvog2']\n",
    "    # Unpack features list because I want interaction prepended.\n",
    "    scores_df = scores_df[['interaction', *FEATURES]]    \n",
    "    if label is not None:\n",
    "        scores_df['label'] = label\n",
    "    return scores_df\n",
    "\n",
    "\n",
    "def concat_data_frames(pos_df, \n",
    "                       neg_df, \n",
    "                       subsample=False,\n",
    "                       clean=True,\n",
    "                       is_scaled=True,\n",
    "                      ):\n",
    "    \"\"\"\n",
    "    Concatenate two dataframes.\n",
    "    \n",
    "    subsample:bool \n",
    "        Subsample the `neg_df` to a number of\n",
    "        observations equal to the number of pos_df \n",
    "        (balance datasets)\n",
    "    \n",
    "    clean:bool\n",
    "        Remove observations with a value of `avg_distance` == 100000 or 1.\n",
    "    \n",
    "    is_scaled:bool\n",
    "        The features have been scaled to a range 0-1\n",
    "    \n",
    "    Return:\n",
    "    concat_df: pd.DataFrame\n",
    "        The concatenated data frame\n",
    "    \"\"\"\n",
    "    \n",
    "    if clean is True:\n",
    "        pos_df = remove_ambiguous(pos_df)\n",
    "        neg_df = remove_ambiguous(neg_df)\n",
    "    \n",
    "    n_positives = pos_df.shape[0]\n",
    "    n_negatives = neg_df.shape[0]\n",
    "    \n",
    "    # Remove possible duplicate interactions from the negatives\n",
    "    # This might happen because of the random selection when creating the set\n",
    "    # Why I also select more negatives to begin with\n",
    "    neg_df = neg_df.loc[~neg_df['interaction'].isin(pos_df['interaction'])]\n",
    "    \n",
    "    if (n_positives != n_negatives) and (subsample is True):\n",
    "        neg_df = neg_df.sample(n=n_positives, random_state=1)\n",
    "    concat_df = pd.concat([pos_df, neg_df])\n",
    "    \n",
    "    assert concat_df[concat_df.duplicated(subset=['interaction'])].empty == True, concat_df.loc[concat_df.duplicated(subset=['interaction'], keep=False)]\n",
    "    \n",
    "    return concat_df\n",
    "\n",
    "def scale_df(input_df):\n",
    "    \"\"\"\n",
    "    Scale all feature values in the data frame to [0-1].\n",
    "    \"\"\"\n",
    "    maxes = input_df[FEATURES].max(axis=0)\n",
    "    scaled_data = input_df[FEATURES].divide(maxes)\n",
    "    if 'label' in input_df.columns:\n",
    "        scaled_df = pd.concat([input_df['interaction'], scaled_data, input_df['label']], axis=1)\n",
    "    else:\n",
    "        scaled_df = pd.concat([input_df['interaction'], scaled_data], axis=1)\n",
    "    return scaled_df\n",
    "\n",
    "def remove_ambiguous(input_df):\n",
    "    \"\"\"\n",
    "    Select observations in the `input_df` that have feature values\n",
    "    \"\"\"\n",
    "    df_clean = input_df[(input_df.jaccard_score != 0)] # This is true if they don't co-occur\n",
    "    return df_clean\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analysis\n",
    "\n",
    "From this point on all the magic happens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get all the files in a list of Path objects\n",
    "filepaths = list(map(Path, snakemake.input))\n",
    "# Remove the last element\n",
    "filtered_scores_tsv = filepaths.pop(-1)\n",
    "negatives_fp_list = []\n",
    "for fp in filepaths:\n",
    "    # Grab the positives\n",
    "    if 'positives' in fp.name:\n",
    "        positives_tsv = fp\n",
    "    else:\n",
    "        # Append to the list of negatives\n",
    "        negatives_fp_list.append(fp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Positives are always the same\n",
    "pos_df = read_scores_table(positives_tsv, label = 1)\n",
    "\n",
    "# Negatives are stored in the list above\n",
    "# Select one for visualization\n",
    "neg_df = read_scores_table(negatives_fp_list[3], label = 0)\n",
    "if SCALE_DATA is True:\n",
    "    pos_df = scale_df(pos_df)\n",
    "    neg_df = scale_df(neg_df)\n",
    "    \n",
    "posneg = concat_data_frames(pos_df, neg_df,\n",
    "                           clean=True,\n",
    "                           subsample=True,\n",
    "                           is_scaled=SCALE_DATA)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_features_correlation(data_df):\n",
    "    # Subset to features only\n",
    "    cor_data = data_df[FEATURES]\n",
    "    cor_df = cor_data.corr()\n",
    "    \n",
    "    plt.figure(figsize=(10, 8))\n",
    "    sns.heatmap(cor_df, annot=True, \n",
    "            cmap=sns.color_palette(\"RdBu_r\"), \n",
    "            vmin=-1, \n",
    "            vmax=1, \n",
    "            cbar_kws = {\"shrink\" : 0.5, \n",
    "                        \"ticks\" : [-1, -0.5, 0, 0.5, 1],\n",
    "                       }\n",
    "           )    \n",
    "    plt.show()\n",
    "\n",
    "## THIS IS NO LONGER USED in favor of the pairplot\n",
    "# you can call it with\n",
    "#  plot_features_dist(posneg, FEATURES, is_scaled = SCALE_DATA)\n",
    "def plot_features_dist(data, features, is_scaled=False):\n",
    "    \"\"\"\n",
    "    Make a gridplot of all features distributions\n",
    "    \n",
    "    data: A pd.DataFrame with features and a label column\n",
    "    features: A list of features to use\n",
    "    \"\"\"\n",
    "    if is_scaled:\n",
    "        distance_threshold = 1\n",
    "    else:\n",
    "        distance_threshold = 1000000\n",
    "        \n",
    "    df_list=[data.loc[:,[f, 'label']] for f in features]\n",
    "    # Initialize a figure\n",
    "    fig, axes = plt.subplots(3, 3, figsize = (8,4), dpi=300)\n",
    "    # Flatten the axes for plotting in the correct ax object\n",
    "    ax_list = axes.flatten()\n",
    "\n",
    "    # Workaround to add the legend\n",
    "    red_patch = mpatches.Patch(color='red', label='positives', alpha=0.5)\n",
    "    blue_patch = mpatches.Patch(color='b', label='negatives', alpha=0.5)\n",
    "    plt.figlegend(handles=[red_patch, blue_patch], loc='lower center')\n",
    "    # Make the plots\n",
    "    for i in range(len(df_list)):\n",
    "        df=df_list[i]\n",
    "        if features[i] == \"avg_distance\":\n",
    "            x0 = df.loc[((df.label==0) & (df.avg_distance != distance_threshold)), features[i]]\n",
    "            x1 = df.loc[((df.label==1) & (df.avg_distance != distance_threshold)), features[i]]\n",
    "        else:\n",
    "            x0 = df.loc[df.label==0, features[i]] \n",
    "            x1 = df.loc[df.label==1, features[i]]\n",
    "#     a1 = sns.distplot(x0, ax=axes[coords[0], coords[1]], axlabel=features[i],color='b', label='negatives')\n",
    "#     a2 =sns.distplot(x1, ax=axes[coords[0], coords[1]], axlabel=features[i], color='r', label='positives')\n",
    "        ax = ax_list[i]\n",
    "        ax.hist(x0, color = 'b', alpha=.5)\n",
    "        ax.hist(x1, color = 'r', alpha= .5)\n",
    "        ax.set_title(\"{} (N={}/{})\".format(features[i], \n",
    "                                           x0.shape[0], \n",
    "                                           x1.shape[0]), \n",
    "                                           fontsize=8\n",
    "                    )\n",
    "\n",
    "    plt.subplots_adjust(hspace=0.7)\n",
    "\n",
    "    axes[2,2].remove()\n",
    "    axes[2,1].remove()    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "pair = sns.pairplot(posneg[FEAT_LAB],\n",
    "                    hue = 'label', \n",
    "                    vars=FEATURES,\n",
    "                    markers = [\"+\", \"x\"],\n",
    "                    palette = {\n",
    "                        0 : 'b',\n",
    "                        1 : 'r',\n",
    "                    },\n",
    "                    corner=True,\n",
    "#              diag_kind=\"hist\",\n",
    "                    plot_kws = {\n",
    "                        \"alpha\": 0.65,\n",
    "                    },\n",
    "                    diag_kws={\n",
    "                        \"clip\" : [0. , 1.],\n",
    "                    },\n",
    "                )\n",
    "pair.set(xlim=(-0.1, 1.1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_features_correlation(posneg)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Cluster interactions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make a copy of the original df for plotting\n",
    "d_posneg = posneg\n",
    "\n",
    "D_posneg = pdist(d_posneg[FEATURES])\n",
    "D_posneg = D_posneg / D_posneg.max() # Scale the distances to be [0-1]\n",
    "Z_posneg = linkage(D_posneg, method=\"average\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Append a color column based on the label value\n",
    "# https://stackoverflow.com/a/26887820\n",
    "\n",
    "def color_label_interaction(row):\n",
    "    if row['label'] == 1:\n",
    "        return 'red'\n",
    "    else:\n",
    "        return 'blue'\n",
    "\n",
    "d_posneg['color'] = d_posneg.apply(lambda row: color_label_interaction(row), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(20, 30))\n",
    "dn = dendrogram(Z_posneg,\n",
    "                labels=d_posneg['interaction'].values, \n",
    "                orientation='left',\n",
    "                leaf_font_size=12,\n",
    "                color_threshold=0.1,\n",
    "                above_threshold_color='black',\n",
    "               )\n",
    "plt.axvline(x=0.1, linestyle='--')\n",
    "plt.title(\"Average linkage based on Euclidean distances\", size=14)\n",
    "# plt.tick_params(axis='y', labelsize=12, direction='out')\n",
    "ax=plt.gca()\n",
    "ylbls = ax.get_ymajorticklabels()\n",
    "for lbl in ylbls:\n",
    "    lbl.set_color(d_posneg.loc[d_posneg.interaction == lbl.get_text(), 'color'].values[0])\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "outputs": [],
   "source": [
    "### Distribution of distances pos vs neg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "D_pos = pdist(pos_df[FEATURES])\n",
    "D_neg = pdist(neg_df[FEATURES])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(10,8))\n",
    "\n",
    "ax = sns.kdeplot(D_pos,\n",
    "                 fill=True,\n",
    "                 alpha = 0.5,\n",
    "                 color='r',\n",
    "                 \n",
    "                )\n",
    "ax = sns.kdeplot(D_neg,\n",
    "                 fill=True,\n",
    "                 alpha = 0.5,\n",
    "                 color='b'\n",
    "                )\n",
    "\n",
    "ax.set_title(\"Distances distribution\", size=14)\n",
    "ax.set_xlabel(\"Distance\", size=14)\n",
    "ax.set_ylabel(\"Density\", size=14)\n",
    "ax.legend([\"Positives\", \"Negatives\"])\n",
    "ax.tick_params(axis='both', which='major', labelsize=12)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Random Forest"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Model selection and hyperparameter tuning"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "outputs": [],
   "source": [
    "Define a search space for the parameters to be sampled from. \n",
    "Less exhaustive than a full grid search, but quicker."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of trees\n",
    "n_estimators = [int(x) for x in range(100,5000,200)]\n",
    "\n",
    "# Number of features to consider at every split\n",
    "# 'auto' uses sqrt(n_features), a float uses int(float * n_features)\n",
    "max_features = ['auto', .5, 1.]\n",
    "\n",
    "# Maximum number of levels in each tree\n",
    "max_depth = [int(x) for x in range(10, 60, 10)]\n",
    "max_depth.append(None)\n",
    "\n",
    "# Minimum number of samples required to split a node\n",
    "min_samples_split = [2, 5, 10]\n",
    "\n",
    "# Minimum number of samples required at each leaf node\n",
    "min_samples_leaf = [1, 2, 4]\n",
    "\n",
    "# Method for selecting samples for training\n",
    "bootstrap = [True, False]\n",
    "\n",
    "# Put them all in a dictionary\n",
    "params_space = {\n",
    "    \"max_features\": max_features,\n",
    "    \"n_estimators\" : n_estimators,\n",
    "    \"max_depth\" : max_depth,\n",
    "    \"min_samples_split\" : min_samples_split,\n",
    "    \"min_samples_leaf\" : min_samples_leaf,\n",
    "    \"bootstrap\": bootstrap,\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a color mapping for the negative sets\n",
    "\n",
    "negative_sets = []\n",
    "for negative in negatives_fp_list:\n",
    "    set_name = negative.name.split('.')[0]\n",
    "    negative_sets.append(set_name)\n",
    "    \n",
    "color_palette = sns.color_palette(\"colorblind\", len(negative_sets))\n",
    "\n",
    "color_dict = dict(zip(negative_sets, color_palette))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_fdr(confusion_matrix):\n",
    "    \"\"\"\n",
    "    Calculate FDR based on the values of the confusion_matrix\n",
    "    \n",
    "    Args:\n",
    "        confusion_matrix: 2d np.array: The confusion matrix returned\n",
    "            by sklearn.metrics.confusion_matrix\n",
    "    Return:\n",
    "        fdr: float: A single value representing the FDR of the model\n",
    "            based on count values of FP/FP+TP\n",
    "    \"\"\"\n",
    "    tn, fp, fn, tp = confusion_matrix.ravel()\n",
    "    fdr = fp / (fp + tp)\n",
    "    return round(fdr, 3)\n",
    "    \n",
    "\n",
    "def get_metric_value(metric_string, \n",
    "                     truth_array, \n",
    "                     prediction_array, \n",
    "                     prediction_proba_array):\n",
    "    \"\"\"\n",
    "    Helper function to get metric values based on a string representation.\n",
    "    \"\"\"\n",
    "    if metric_string == 'accuracy':\n",
    "        return metrics.accuracy_score(truth_array, prediction_array)\n",
    "    if metric_string == 'accuracy_abs':\n",
    "        return metrics.accuracy_score(truth_array, prediction_array, normalize = False)\n",
    "    if metric_string == 'precision':\n",
    "        return metrics.precision_score(truth_array, prediction_array)\n",
    "    if metric_string == 'recall':\n",
    "        return metrics.recall_score(truth_array, prediction_array)\n",
    "    if metric_string == 'f1_score':\n",
    "        return metrics.f1_score(truth_array, prediction_array)\n",
    "    if metric_string == 'roc_auc_score':\n",
    "        return metrics.roc_auc_score(truth_array, prediction_proba_array[:,1])\n",
    "    if metric_string == 'fpr':\n",
    "        return metrics.roc_curve(truth_array, prediction_proba_array[:,1])[0]\n",
    "    if metric_string == 'tpr':\n",
    "        return metrics.roc_curve(truth_array, prediction_proba_array[:,1])[1]\n",
    "    if metric_string == 'confusion_matrix':\n",
    "        return metrics.confusion_matrix(truth_array, prediction_array)\n",
    "    if metric_string == 'fdr':\n",
    "        cf = metrics.confusion_matrix(truth_array, prediction_array)\n",
    "        fdr = calculate_fdr(cf)\n",
    "        return fdr\n",
    "    if metric_string == 'mcc':\n",
    "        return metrics.matthews_corrcoef(truth_array, prediction_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A list of all the metrics we are storing\n",
    "# _metrics so we don't override the imported sklearn.metrics\n",
    "_metrics = [\n",
    "    'accuracy', \n",
    "    'accuracy_abs', \n",
    "    'f1_score', \n",
    "    'precision', \n",
    "    'recall', \n",
    "    'roc_auc_score', \n",
    "    'fpr', \n",
    "    'tpr',\n",
    "    'confusion_matrix',\n",
    "    'fdr',\n",
    "    'mcc'\n",
    "    ]\n",
    "\n",
    "# A dictionary that will hold all the values\n",
    "# for the metrics above\n",
    "all_metrics = {}\n",
    "# A dictionary that holds the parameters for the best model\n",
    "# after each iteration\n",
    "best_models = {}\n",
    "\n",
    "# A dictionary that holds feature importances\n",
    "# for each iteration\n",
    "feat_importances = {}\n",
    "\n",
    "for i, negative in enumerate(negatives_fp_list):\n",
    "    \n",
    "    # Make a copy of the list with the negative_files\n",
    "    copy_of_list = negatives_fp_list.copy()    \n",
    "    # Get the name of the current negative\n",
    "    training_set = negative.name.split('.')[0]\n",
    "    # Read the current negative data in a data frame\n",
    "    neg_df = read_scores_table(negative, label=0)\n",
    "    # ... scale it\n",
    "    neg = scale_df(neg_df)\n",
    "    # Concatenate with the positive\n",
    "    # A ground truth set is constructed\n",
    "    posneg = concat_data_frames(pos_df, \n",
    "                                neg, \n",
    "                                subsample=True, \n",
    "                                clean=True, \n",
    "                                is_scaled=True)\n",
    "    \n",
    "    # Split the frame to features and the label\n",
    "    X = posneg[FEATURES]\n",
    "    y = posneg['label']\n",
    "    \n",
    "    # Train test split\n",
    "    X_train, X_holdout, y_train, y_holdout = train_test_split(X,\n",
    "                                                              y,\n",
    "                                                              test_size=0.3,\n",
    "                                                              random_state=1)\n",
    "    \n",
    "    # Instantiate the Randomized Search\n",
    "    models = RandomizedSearchCV(\n",
    "        estimator = RandomForestClassifier(random_state=1),\n",
    "        param_distributions = params_space,\n",
    "        n_iter = 500, # number of parameter settings to sample\n",
    "        cv = 5, # Use 5-folds for CV\n",
    "        random_state = 1, # Set random state for reproducibility\n",
    "        n_jobs = snakemake.threads,\n",
    "        verbose=1 # Print some messages to keep track of progress\n",
    "        )\n",
    "    # Execute the search\n",
    "    search = models.fit(X_train, y_train)\n",
    "    \n",
    "    # Make a classifier out of the best model\n",
    "    best_model = RandomForestClassifier(**search.best_params_, \n",
    "                                        random_state=1)\n",
    "    \n",
    "    # Fit the best model on the training data\n",
    "    best_model.fit(X_train, y_train)\n",
    "    \n",
    "    # Predict labels and probabilities for the test/holdout set\n",
    "    holdout_pred = best_model.predict(X_holdout)\n",
    "    holdout_proba = best_model.predict_proba(X_holdout)\n",
    "    \n",
    "    ###############\n",
    "    # Store results\n",
    "    ###############\n",
    "    # Create a tuple of the strings of the training set\n",
    "    # as a key for the all_metrics dict\n",
    "    self_key = (training_set, training_set)\n",
    "    \n",
    "    # The get_metric_value is defined above\n",
    "    all_metrics[self_key] = {metric : get_metric_value(metric,\n",
    "                                                       y_holdout,\n",
    "                                                       holdout_pred,\n",
    "                                                       holdout_proba)\n",
    "                            for metric in _metrics}\n",
    "    \n",
    "    best_models[training_set] = best_model\n",
    "    \n",
    "    importances = best_model.feature_importances_      \n",
    "    feat_importances[self_key] = dict(zip(FEATURES, importances))\n",
    "    \n",
    "    # Remove the current negative set file from the copy of the list\n",
    "    # The rest will be used for validation\n",
    "    validation_sets = [i for i in copy_of_list if i != negative]\n",
    "\n",
    "    for validation_set in validation_sets:\n",
    "        # Repeat the same process as above, but without optimizing\n",
    "        validation_set_name = validation_set.name.split('.')[0]\n",
    "        negg = read_scores_table(validation_set, label=0)\n",
    "        scaled_negg = scale_df(negg)\n",
    "        \n",
    "        pn = concat_data_frames(pos_df, \n",
    "                                scaled_negg, \n",
    "                                subsample=True, \n",
    "                                clean=True, \n",
    "                                is_scaled=True)\n",
    "        XX = pn[FEATURES]\n",
    "        yy = pn['label']\n",
    "        \n",
    "        XX_train, XX_holdout, yy_train, yy_holdout = train_test_split(XX, yy, test_size=0.3, random_state=1)\n",
    "        best_model.fit(XX_train, yy_train)\n",
    "        holdout_pred = best_model.predict(XX_holdout)\n",
    "        holdout_proba = best_model.predict_proba(XX_holdout)\n",
    "        \n",
    "        \n",
    "        combo_key = (training_set, validation_set_name)\n",
    "        all_metrics[combo_key] = {metric : get_metric_value(metric, yy_holdout, holdout_pred, holdout_proba) \n",
    "                                 for metric in _metrics}\n",
    "        \n",
    "        importances = best_model.feature_importances_      \n",
    "        feat_importances[combo_key] = dict(zip(FEATURES, importances))\n",
    "    \n",
    "    print(\"Finished set : {}\".format(i))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Save results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Results are stored in the results/RF directory\n",
    "# Create it, or do nothing\n",
    "RF_dir = Path(\"results/RF\")\n",
    "RF_dir.mkdir(exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "## metrics\n",
    "metrics_df = pd.DataFrame.from_dict(all_metrics, orient='index', columns=_metrics)\n",
    "\n",
    "# Append columns TS (Training Set) and VS (Validation Set)\n",
    "# For slicing\n",
    "metrics_df['TS'] = [i[0] for i in metrics_df.index.values]\n",
    "metrics_df['VS'] = [i[1] for i in metrics_df.index.values]\n",
    "\n",
    "# Pickle is used to store the fpr, tpr, confusion matrix values as arrays\n",
    "# otherwise it messes up the column formatting\n",
    "metrics_fp = RF_dir / Path('metrics.pkl')\n",
    "metrics_tsv = RF_dir / Path('metrics.tsv')\n",
    "metrics_df.to_pickle(metrics_fp)\n",
    "## Drop the fpr,tpr, confusion_matrix (arrays) to write to tsv\n",
    "metrics_df.drop(columns=['fpr', \n",
    "                         'tpr',\n",
    "                         'confusion_matrix']).to_csv(metrics_tsv, \n",
    "                                        sep='\\t', \n",
    "                                        index=False)\n",
    "\n",
    "## features\n",
    "features_df = pd.DataFrame.from_dict(feat_importances,\n",
    "                                     orient = 'index',\n",
    "                                     columns = FEATURES)\n",
    "features_df['TS'] = [i[0] for i in features_df.index.values]\n",
    "features_df['VS'] = [i[1] for i in features_df.index.values]\n",
    "features_fp = RF_dir / Path(\"features.tsv\")\n",
    "\n",
    "features_df.to_csv(features_fp, sep='\\t', index=False)\n",
    "\n",
    "\n",
    "## models\n",
    "models_dir = RF_dir / Path(\"models\")\n",
    "models_dir.mkdir(exist_ok=True)\n",
    "for dataset in best_models:\n",
    "    pkl_fname = Path('{}.RF.pkl'.format(dataset))\n",
    "    pkl_fpath = models_dir.joinpath(pkl_fname)\n",
    "    with open(pkl_fpath, 'wb') as fout:\n",
    "        pickle.dump(best_models[dataset], fout)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Best model across the board"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate min, max, mean, std for all metrics across datasets\n",
    "## this probably is not the most pythonic way of doing this\n",
    "\n",
    "metrics_stats = {}\n",
    "for i, negative in enumerate(negatives_fp_list):\n",
    "    dataset_name = negative.name.split('.')[0]\n",
    "    # Get the stats for a particular dataset and trnaspose it\n",
    "    set_df = metrics_df.loc[metrics_df[\"TS\"] == dataset_name, _metrics].describe().T\n",
    "    \n",
    "    metrics_stats[dataset_name] = {'accuracy_mean' : set_df.loc['accuracy', 'mean'],\n",
    "                                 'accuracy_std' : set_df.loc['accuracy', 'std'],\n",
    "                                'accuracy_min' : set_df.loc['accuracy', 'min'],\n",
    "                                'accuracy_max' : set_df.loc['accuracy', 'max'],\n",
    "                                 'precision_mean' : set_df.loc['precision', 'mean'],\n",
    "                                 'precision_std' : set_df.loc['precision', 'std'],\n",
    "                                'precision_min' : set_df.loc['precision', 'min'],\n",
    "                                'precision_max' : set_df.loc['precision', 'max'],\n",
    "                                 'recall_mean' : set_df.loc['recall', 'mean'],\n",
    "                                 'recall_std' : set_df.loc['recall', 'std'],\n",
    "                                'recall_min' : set_df.loc['recall', 'min'],\n",
    "                                'recall_max' : set_df.loc['recall', 'max'],\n",
    "                                 'f1_score_mean': set_df.loc['f1_score', 'mean'],\n",
    "                                 'f1_score_std': set_df.loc['f1_score', 'std'],\n",
    "                                'f1_score_min' : set_df.loc['f1_score', 'min'],\n",
    "                                'f1_score_max' : set_df.loc['f1_score', 'max'],\n",
    "                                'roc_auc_score_mean': set_df.loc['roc_auc_score', 'mean'],\n",
    "                                 'roc_auc_score_std': set_df.loc['roc_auc_score', 'std'],\n",
    "                                'roc_auc_score_min' : set_df.loc['roc_auc_score', 'min'],\n",
    "                                'roc_auc_score_max' : set_df.loc['roc_auc_score', 'max'],\n",
    "                                'accuracy_abs_mean' : set_df.loc['accuracy_abs', 'mean'],\n",
    "                                'accuracy_abs_std' : set_df.loc['accuracy_abs', 'std'],\n",
    "                                'accuracy_abs_min' : set_df.loc['accuracy_abs', 'min'],\n",
    "                                'accuracy_abs_max' : set_df.loc['accuracy_abs', 'max']\n",
    "                               }\n",
    "\n",
    "metrics_stats_df = pd.DataFrame.from_dict(metrics_stats, orient='index')\n",
    "\n",
    "# Save them\n",
    "metrics_stats_fp = RF_dir / Path(\"metrics.stats.tsv\")\n",
    "metrics_stats_df.to_csv(metrics_stats_fp, sep='\\t',)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select the best model automatically\n",
    "\n",
    "# Initialize a dictionary {N1 : 0, N2 : 0, ...}\n",
    "dataset_scores = dict(zip(metrics_stats_df.T.columns, \n",
    "                          [0 for i in range(len(metrics_stats_df.T.columns))]))\n",
    "# Iterate over the rows of the dataframe\n",
    "for stat, row in metrics_stats_df.T.iterrows():\n",
    "    if stat.endswith('std'):\n",
    "        #row is a series. idxmax() returns the value of the index of the series\n",
    "        dataset_scores[row.idxmin()] += 1 \n",
    "    else:\n",
    "        dataset_scores[row.idxmax()] += 1\n",
    "        \n",
    "# Select best scoring model\n",
    "best_model = max(dataset_scores.items(), key=operator.itemgetter(1))[0]\n",
    "print(\"AND THE BEST MODEL IS...: {}\".format(best_model))\n",
    "\n",
    "best_model_fp = RF_dir / Path(\"best_model.pkl\")\n",
    "# Should be identical with the one in the models dir\n",
    "with open(best_model_fp, 'wb') as fout:\n",
    "    pickle.dump(best_models[best_model], fout)\n",
    "\n",
    "# Write the best model id in a file.\n",
    "## This is used internally in the workflow as a checkpoint\n",
    "# Should be a file containing just the id , e.g. N2\n",
    "best_model_id_fp = RF_dir / Path(\"best_model_id.txt\")\n",
    "with open(best_model_id_fp, 'w') as fout:\n",
    "    fout.write(best_model)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "outputs": [],
   "source": [
    "# PLOTS"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_dataset_roc_curve(training_set,\n",
    "                           data_df,\n",
    "                           plot_all=True,\n",
    "                           ax=None,\n",
    "                           label_xy=True):\n",
    "    \"\"\"\n",
    "    Plot the roc curve for a dataset.\n",
    "    training_set:str\n",
    "        The name of the dataset\n",
    "    data_df: pandas.Dataframe\n",
    "        A data frame that holds all metrics\n",
    "    plot_all:bool\n",
    "        If roc curves for all other datasets used for validation\n",
    "        should be plotted\n",
    "    ax:pyplot.axes\n",
    "        An axes object (used for subplotting)\n",
    "    label_xy:bool\n",
    "        If axes x,y should be annotated\n",
    "    \n",
    "    \"\"\"\n",
    "    \n",
    "    if ax is None:\n",
    "        ax = plt.gca()\n",
    "        \n",
    "    rc = ax.plot([0,1], [0,1], linestyle='--', color='k', label='Baseline')\n",
    "    \n",
    "    # Select all rows for a particular dataset used for the optimization\n",
    "    # and create a new dataframe\n",
    "    train_df = data_df.loc[data_df['TS'] == training_set]\n",
    "    # Grab the fpr and tpr arrays for that set\n",
    "    \n",
    "    fpr = train_df.loc[train_df['VS'] == training_set, 'fpr'].values\n",
    "    tpr = train_df.loc[train_df['VS'] == training_set, 'tpr'].values\n",
    "    # Get the auc_score value\n",
    "    auc_score = data_df.loc[(training_set, training_set), 'roc_auc_score']\n",
    "    # Plot the roc curve\n",
    "    ## fpr and tpr are returned as an array of arrays\n",
    "    ## with length == 1. Hence, we grab the first ellement which\n",
    "    ## is the array itself\n",
    "    rc = ax.plot(fpr[0], tpr[0], \n",
    "                 label=\"{}, AUROC={:.2f}\".format(training_set, auc_score),\n",
    "                 color = color_dict.get(training_set), \n",
    "                 linewidth=3, \n",
    "                 alpha=1)\n",
    "    # For plotting all other datasets roc curves\n",
    "    if plot_all is True:\n",
    "        for validation_set in train_df['VS'].values:\n",
    "            if validation_set == training_set:\n",
    "                pass\n",
    "            else:\n",
    "                # This is done on the subset train_df\n",
    "                fpr = train_df.loc[train_df['VS'] == validation_set, 'fpr'].values\n",
    "                tpr = train_df.loc[train_df['VS'] == validation_set, 'tpr'].values\n",
    "                auc_score = data_df.loc[(training_set, validation_set), 'roc_auc_score']\n",
    "                rc = ax.plot(fpr[0], tpr[0], \n",
    "                             label=\"{}, AUROC={:.2f}\".format(validation_set, auc_score),\n",
    "                            color = color_dict.get(validation_set),\n",
    "                             linewidth=1,\n",
    "                            alpha=0.7)\n",
    "            \n",
    "    rc = ax.legend(loc='lower right', fontsize='small')\n",
    "    \n",
    "    # When plotting multiple datasets in a subplots\n",
    "    if label_xy is True:\n",
    "        rc = ax.set_ylabel('True Positive Rate')\n",
    "        rc = ax.set_xlabel('False Positive Rate')\n",
    "    # Set the master title    \n",