run.py
"""
Use the graph generation methods provided by the network_discovery module
"""
import numpy as np
import networkx as nx
from time import time
from sklearn import linear_model, metrics, model_selection
from network_discovery import *
from dataset import *
from file_io import *

def classify(model, X, y, cv):
    """
    Parameters
    ----------
    model : sklearn classification object
    X : numpy array (n_samples, n_classes)
        The samples
    y : numpy array (n_samples,)
        The labels
    cv : sklearn cross-validator
        The object that provides train/test split indices
    """
    perf_metrics = [ metrics.accuracy_score, metrics.precision_score, metrics.recall_score,
                     metrics.f1_score, metrics.roc_auc_score ]
    avg_scores = np.zeros(len(perf_metrics))
    n_splits = cv.get_n_splits(X)

    for train_index, test_index in cv.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        scores = np.asarray([perf_metric(y_test, y_pred) for perf_metric in perf_metrics])
        avg_scores += scores

    avg_scores /= n_splits
    return list(avg_scores)
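
# Example usage of classify (a minimal sketch on a synthetic binary problem;
# the make_classification call below is illustrative and not part of this repo):
#
#   from sklearn.datasets import make_classification
#   X, y = make_classification(n_samples=200, n_features=5, random_state=0)
#   model = linear_model.LogisticRegression()
#   cv = model_selection.StratifiedKFold(n_splits=5)
#   acc, prec, rec, f1, auc = classify(model, X, y, cv)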

def create_feature_vector(graph):
    """Create a feature vector for the graph using well-known graph statistics"""
    awd = avg_weighted_degree(graph)
    cc = avg_cc(graph)
    mod = modularity(graph)
    apl = avg_path_length(graph)
    dens = nx.density(graph)
    return [ awd, cc, mod, apl, dens ]
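
# Example (assumes the network_discovery helpers accept an arbitrary networkx
# graph; karate_club_graph is just a stand-in for a real data graph):
#
#   g = nx.karate_club_graph()
#   print(create_feature_vector(g))  # [awd, cc, mod, apl, density]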

def get_prediction_scores(graphs, labels):
    """
    Predict health status for different people.
    For each graph, create a feature vector out of that graph.

    Parameters
    ----------
    graphs : length-n list of networkx graphs
    labels : length-n list of binary (health) labels associated with the graphs
    """
    assert len(graphs) == len(labels)

    X = []
    for graph in graphs: # create a feature vector from each graph
        X.append(create_feature_vector(graph))
    X = np.asarray(X)
    y = np.asarray(labels)

    C = 1 # inverse regularization strength for logistic regression
    model = linear_model.LogisticRegression(C=C)
    cv = model_selection.StratifiedKFold(n_splits=10)
    scores = classify(model, X, y, cv)
    print('Average classification scores:')
    print('Accuracy: {}'.format(scores[0]))
    print('Precision: {}'.format(scores[1]))
    print('Recall: {}'.format(scores[2]))
    print('F1: {}'.format(scores[3]))
    print('AUC: {}'.format(scores[4]))
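
# Example usage of get_prediction_scores (illustrative only: random
# Erdos-Renyi graphs with alternating labels stand in for real subject data;
# 20 samples is the minimum that satisfies the 10-fold stratified split):
#
#   graphs = [nx.erdos_renyi_graph(30, 0.2) for _ in range(20)]
#   labels = [i % 2 for i in range(20)]
#   get_prediction_scores(graphs, labels)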

def main():
    # Here, use a randomly generated dataset
    # You can replace this with real time series data
    # Use the functions in the file_io and dataset modules to read in your data
    dataset = SyntheticDataset()
    data = dataset.gen_data(100, 1000, write_to_file=False)

    # Construct a graph using LSH, with a window size of 8
    # You can change the window size k, length of hash signatures r, and the
    # number of hash tables b
    k, r, b = 8, 1, 10
    data = binarize(data)
    t0 = time()
    print('Constructing LSH graph...')
    tables = window_lsh(abc_similarity, data, k=k, r=r, b=b)
    print('Constructed graph in {:.2f} seconds'.format(time() - t0))
    print(tables[0])

if __name__ == '__main__':
    main()