main.py 11.3 KB
Newer Older
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
1
from flask import Flask, request
2 3 4
import numpy as np
from flask_cors import CORS
from time import time
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
5
import pandas as pd
6
import orjson
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
7 8
import bigwig
import bbi
9
import _ucrdtw
10 11 12
import _lsh
import dtw
import math
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
13 14
import dask.dataframe as dd
import os.path
15
from random import sample
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
16 17 18
from DBA_multivariate import performDBA
from tslearn.metrics import dtw
from sklearn import preprocessing
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
19

20
reload = False
21 22 23 24 25 26 27 28 29 30

app = Flask(__name__)
CORS(app)

@app.route('/', methods=['GET'])
def index():
    return "hi"

@app.route('/read-data', methods=['GET'])
def read_data():
31
    t0 = time()
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
32 33 34 35
    size = bbi.chromsizes('test.bigWig')['chr1']
    bins = 100000
    data = bigwig.get('test.bigWig', 'chr1', 0, size, bins)
    print(data.shape)
36
    response = {
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
37 38
        "index": list(range(0, size, int(size/(bins)))),
        "values": data.tolist()
39
    }
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
40
    response = orjson.dumps(response)
41
    print('Data read: ' + str(time()-t0))
42 43
    return response

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105
@app.route('/read-mts-data', methods=['GET'])
def read_mts_data():
    filename = 'data.pkl'
    if (not os.path.isfile(filename)):
        print("start")
        df = dd.read_csv("NW_Ground_Stations_2016.csv", usecols=['number_sta', 'date', 't', 'hu', 'td'])
        print("read file")
        df = df.loc[df['number_sta'] == 14066001]
        print("split rows")
        df = df.compute()
        df.to_pickle(filename)
        print("to_pandas")
    df = pd.read_pickle(filename)
    df.dropna(subset=['t'], inplace=True)
    response = [
        {
            "index": df.loc[:, 'date'].values.astype(str).tolist(),
            "values": df.loc[:, 't'].values.tolist()
        },
        {
            "index": df.loc[:, 'date'].values.astype(str).tolist(),
            "values": df.loc[:, 'hu'].values.tolist()
        },
        {
            "index": df.loc[:, 'date'].values.astype(str).tolist(),
            "values": df.loc[:, 'td'].values.tolist()
        }
    ]
    print("response ready")
    response = orjson.dumps(response)
    return response

@app.route('/create-mts-windows', methods=['POST'])
def create_mts_windows():
    t0 = time()
    if (not os.path.isfile('processed-data.npy')):
        filename = 'data.pkl'
        df = pd.read_pickle(filename)
        channels = list()
        channels.append(df.loc[:, 't'].fillna(0).values.tolist())
        channels.append(df.loc[:, 'hu'].fillna(0).values.tolist())
        channels.append(df.loc[:, 'td'].fillna(0).values.tolist())
        print(np.isnan(df.loc[:, 't'].fillna(0).values.tolist()).any())
        print(np.isnan(df.loc[:, 'hu'].fillna(0).values.tolist()).any())
        print(np.isnan(df.loc[:, 'td'].fillna(0).values.tolist()).any())
        print("Data read: " + str(time()-t0))
        # raw_data = request.json
        window_size = 120 #int(raw_data['parameters']["windowsize"])
        print("Processing: " + str(time()-t0))
        data = [([values[i:i+window_size] for values in channels]) for i in range(0, len(channels[0]) - window_size, 5)]
        print("Raw windows: " + str(time()-t0))
        windows = []
        for i in range(len(data)):
            if i % 5000 == 0:
                print(i)
            windows.append(preprocessing.minmax_scale(data[i], (-1, 1), axis=1))
        print("Preprocessed: " + str(time()-t0))
        np.save('processed-data', windows)
    print("Sending response: " + str(time()-t0))
    return '1'


106 107
@app.route('/create-windows', methods=['POST'])
def create_windows():
108
    t0 = time()
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
109
    if reload:
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
110 111
        raw_data = request.json
        window_size = int(raw_data['parameters']["windowsize"])
112
        chromsize = bbi.chromsizes('test.bigWig')['chr1']
113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
        step_size = int(12000 / 6)
        start_bps = np.arange(0, chromsize - 12000 + step_size, step_size)
        end_bps = np.arange(12000, chromsize + step_size, step_size)
        data = bigwig.chunk(
            'test.bigWig',
            12000,
            int(12000 / window_size),
            int(12000 / 6),
            ['chr1'],
            verbose=True,
        )
        # data = bbi.stackup(
        #     'test.bigWig',
        #     ['chr1'] * start_bps.size,
        #     start_bps,
        #     end_bps,
        #     bins=window_size,
        #     missing=0.0,
        #     oob=0.0,
        # )
        # data = (data - np.min(data))/np.ptp(data)
134
        np.save('processed-data', data)
135
        np.savetxt('processed-data', data, delimiter=' ', fmt='%f')
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
136
        np.savetxt('query', data[80503], delimiter=' ', fmt='%f')
137
    print('Windows created: ' + str(time()-t0))
138
    return '1'
139

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
140 141 142
@app.route('/initialize', methods=['POST'])
def initialize():
    t0 = time()
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
143
    raw_data = orjson.loads(request.data)
144
    data = np.load('processed-data.npy')
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
145
    data = np.reshape(data, (len(data), len(data[0][0]), len(data[0])))
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
146
    query = raw_data["query"]
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
147
    query = np.reshape(query, (len(query[0]), len(query)))
148

149
    r, a, sd = preprocess(data)
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
150
    candidates, distances, hf = _lsh.lsh(data, query, r, a, sd)
151

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
152 153 154 155
    response = {
        "hash_functions": hf.tolist(),
        "candidates": candidates.tolist(),
        "distances": distances.tolist(),
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
156
        "parameters": [float(r), float(a), float(sd)]
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
157 158
    }
    response = orjson.dumps(response)
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
159
    print('LSH done: ' + str(time()-t0))
160 161
    return response

162 163 164 165 166 167 168 169 170 171 172
@app.route('/weights', methods=['POST'])
def weights():
    raw_data = orjson.loads(request.data)
    parameters = raw_data["labels"]

    # Caculate weights

    response = weights
    return response


173 174 175 176 177
@app.route('/update', methods=['POST'])
def update():
    t0 = time()
    raw_data = orjson.loads(request.data)
    data = np.load('processed-data.npy')
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
178
    data = np.reshape(data, (len(data), len(data[0][0]), len(data[0])))
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
179
    query = raw_data["query"]
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
180 181
    query = np.reshape(query, (len(query[0]), len(query)))
    weights = raw_data["weights"]
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
182 183
    parameters = raw_data["parameters"]

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
184
    candidates, distances, hf = _lsh.lsh(data, query, parameters[0], parameters[1], parameters[2])
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
185 186 187 188 189 190
    response = {
        "hash_functions": hf.tolist(),
        "distances": distances.tolist(),
        "candidates": candidates.tolist()
    }
    response = orjson.dumps(response)
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
191
    print('LSH done: ' + str(time()-t0))
192 193
    return response

194 195
@app.route('/query', methods=['POST'])
def query():
196
    t0 = time()
197
    raw_data = orjson.loads(request.data)
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
198 199 200 201
    windowIndices = raw_data['window']
    if isinstance(windowIndices, int):
        output = np.load('processed-data.npy')[windowIndices]
        response = orjson.dumps(output.tolist())
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
202 203
        print("Query done: " + str(time() - t0))
        return response
204
    else:
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
205
        indices = [int(index) for index, value in windowIndices.items() if value is True]
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
206
        data = np.load('processed-data.npy')[indices]
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
207 208
        output = performDBA(data)
        response = orjson.dumps(output.tolist())
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
209 210 211 212 213 214 215 216 217
        print("Query done: " + str(time()-t0))
        return response

@app.route('/window', methods=['POST'])
def window():
    t0 = time()
    raw_data = orjson.loads(request.data)
    indices = raw_data['indices']
    output = np.load('processed-data.npy')[indices]
218
    response = orjson.dumps(output.tolist())
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
219
    print("Query done: " + str(time() - t0))
220 221
    return response

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
222 223
@app.route('/table-info', methods=['POST'])
def table_info():
224 225 226 227
    t0 = time()
    raw_data = orjson.loads(request.data)
    all_windows = raw_data['windows']
    data = np.load('processed-data.npy')
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
228
    prototypes = []
229
    for windows in all_windows:
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
230
        actual_windows = data[windows]
231 232 233 234
        average_values = np.average(actual_windows, 0)
        std_values = np.std(actual_windows, 0)
        max_values = average_values + std_values
        min_values = average_values - std_values
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
235
        prototypes.append({
236 237 238
            'average': average_values.tolist(),
            'max': max_values.tolist(),
            'min': min_values.tolist()
239
        })
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
240
    distances = [[dtw(np.array(v["average"]), np.array(w["average"]), global_constraint='sakoe_chiba', sakoe_chiba_radius=int(0.05 * 120)) for j, w in enumerate(prototypes)] for i, v in enumerate(prototypes)]
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
241
    response = orjson.dumps({'prototypes': prototypes, 'distances': distances})
242
    print("Averages calculated: " + str(time() - t0))
243 244
    return response

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
245
def preprocess(data, r=10):
246 247
    # return 0.10882589134534404, 3.1202154563478928, 0.9705780396843037
    # data = np.load('processed-data.npy')
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
248
    data = np.array(data, dtype='double')
249 250
    # data = np.reshape(data, (int(len(data) / 1), 1, len(data[0])))
    # data = np.repeat(data, repeats=1, axis=1)
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
251 252 253
    subset = []
    t0 = time()

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
254 255 256 257 258 259 260 261 262 263
    # for i, window in enumerate(data):
    #     if i % 10000 == 0:
    #         print(str(i) + ':' + str(len(subset)))
    #     state = 1
    #     for s in subset:
    #         if np.linalg.norm(window - data[s]) < r:
    #             state = 0
    #             break
    #     if state == 1:
    #         subset.append(i)
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
264

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
265
    subset = sample(list(range(len(data))), 200)
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
266

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
267 268 269 270 271 272 273 274
    dtw_distances = []
    eq_distances = []
    for i, index_1 in enumerate(subset):
        print(i)
        for j, index_2 in enumerate(subset):
            if index_1 == index_2:
                continue
            e = np.linalg.norm(data[index_1] - data[index_2])
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
275 276
            if (math.isnan(e)):
                continue
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
277
            eq_distances.append(e)
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
278 279
            d = dtw(data[index_1], data[index_2], global_constraint='sakoe_chiba', sakoe_chiba_radius=int(0.05*120))
            # d = _ucrdtw.ucrdtw(data[index_1], data[index_2], 0.05, False)[1]
280
            # d = dtw.dtw(data[index_1], data[index_2], dist_method="Euclidean", window_type="sakoechiba", window_args={"window_size": 120}).distance
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
281
            dtw_distances.append(d)
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
282

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
283 284 285 286 287 288 289 290 291 292 293
    ratios = np.array(dtw_distances)/np.array(eq_distances)
    mean_dtw = np.mean(dtw_distances)
    sd_dtw = np.std(dtw_distances)
    mean_eq = np.mean(eq_distances)
    sd_eq = np.std(eq_distances)
    a = np.mean(ratios)
    sd = np.std(ratios)
    theta = mean_dtw + -2.58 * sd_dtw
    # theta = mean_eq + -2.58 * sd_eq
    r = theta / ((a-sd)*math.sqrt(120))
    # r = theta / (math.sqrt(120))
294 295 296 297 298 299
    print('Mean: ' + str(mean_dtw))
    print('Stdev: ' + str(sd_dtw))
    print('Ratio mean: ' + str(a))
    print('Ratio stdev: ' + str(sd))
    print('Theta: ' + str(theta))
    print('r: ' + str(r))
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
300
    print('Preprocessing time: ' + str(time() - t0))
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
301
    return r, a, sd
302

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
303
def debug_test_lsh():
304
    data = np.load('processed-data.npy')
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
305 306 307 308 309 310 311 312 313
    print(data.shape)
    data = np.reshape(data, (len(data), 120, 3))
    # data2 = np.copy(data)
    # for i in range(5):
    #     np.random.shuffle(data2)
    #     data = np.concatenate((data, data2), axis=2)
    # print(data.shape)
    # data = np.repeat(data, repeats=10, axis=2)
    r, a, sd = preprocess(data, 10)
314
    create_windows()
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
315
    query_n = 1234
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
316

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
317 318
    t0 = time()
    query = data[query_n]
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
319
    candidates, distances, hf = _lsh.lsh(data, query, r, a, sd)
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
320 321
    print("Calculated approximate in: " + str(time()-t0))
    print(candidates[0:20])
322

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
323 324
    t0 = time()
    distances = [dtw(window, query, global_constraint='sakoe_chiba', sakoe_chiba_radius=int(0.05*120)) for window in data]
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
325
    topk_dtw = sorted(range(len(distances)), key=lambda k: distances[k])
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
326
    print("Calculated exact dtw in: " + str(time()-t0))
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
327
    print(topk_dtw[0:10])
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
328

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
329 330 331
    # # distances_ed = [distance.euclidean(query, window) for window in data]
    # # topk_ed = sorted(range(len(distances_ed)), key=lambda k: distances_ed[k])
    #
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
332
    accuracy = 0
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
333 334
    for index in topk_dtw[0:50]:
        if index in candidates[0:50]:
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
335 336
            accuracy += 1
    print(accuracy)
337

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
338 339
# read_mts_data()
# create_mts_windows()
340
# debug_test_lsh()