main.py 13.7 KB
Newer Older
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
1
from flask import Flask, request
2
3
4
import numpy as np
from flask_cors import CORS
from time import time
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
5
import pandas as pd
6
import orjson
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
7
8
import bigwig
import bbi
9
import _ucrdtw
10
11
import _lsh
import math
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
12
13
import dask.dataframe as dd
import os.path
14
from random import sample
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
15
16
17
from DBA_multivariate import performDBA
from tslearn.metrics import dtw
from sklearn import preprocessing
18
19
20
21
22
from collections import defaultdict
from dtaidistance import dtw_ndim
from scipy.spatial.distance import euclidean

from fastdtw import fastdtw
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
23

24
# Module-level flag. NOTE(review): nothing in the visible part of this file
# reads it -- confirm whether it is still needed.
reload = False
25
26
27
28
29
30
31
32
33
34

# The Flask application that every @app.route handler below registers on.
app = Flask(__name__)
# Wrap the app with flask_cors (presumably so a browser front end served from
# another origin can call these endpoints -- confirm against the client).
CORS(app)

@app.route('/', methods=['GET'])
def index():
    """Answer GET requests on the root URL with the fixed text ``hi``."""
    greeting = "hi"
    return greeting

@app.route('/read-data', methods=['GET'])
def read_data():
    """Serve the ``chr1`` signal of ``test.bigWig`` as three identical channels.

    The chromosome size is looked up with ``bbi.chromsizes`` and the signal is
    fetched with ``bigwig.get`` over ``[0, size)`` using ``bins`` bins.

    Returns:
        bytes: orjson-encoded list of three ``{"index": [...], "values": [...]}``
        objects that all carry the same index and the same values.
    """
    t0 = time()
    size = bbi.chromsizes('test.bigWig')['chr1']
    bins = 100000
    data = bigwig.get('test.bigWig', 'chr1', 0, size, bins)
    print(data.shape)

    # The axis and the value list are the same for every channel, so they are
    # built once instead of three times.  max(1, ...) keeps range() valid when
    # the chromosome is shorter than the number of bins (a step of 0 raises).
    step = max(1, int(size / bins))
    index = list(range(0, size, step))
    values = data.tolist()
    response = [{"index": index, "values": values} for _ in range(3)]

    response = orjson.dumps(response)
    print('Data read: ' + str(time()-t0))
    return response

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
58
59
60
61
62
63
64
@app.route('/read-mts-data', methods=['GET'])
def read_mts_data():
    """Serve the ``t``, ``hu`` and ``td`` series of station 14066001.

    On the first call ``NW_Ground_Stations_2016.csv`` is read with dask,
    reduced to a fixed list of stations, NaN-filled with 0 and cached as
    ``data.pkl``.  Every call then reads that pickle back.

    Returns:
        bytes: orjson-encoded list with one ``{"index": dates, "values": ...}``
        object per column, in the order ``t``, ``hu``, ``td``.
    """
    filename = 'data.pkl'
    if not os.path.isfile(filename):
        print("start")
        df = dd.read_csv("NW_Ground_Stations_2016.csv", usecols=['number_sta', 'date', 't', 'hu', 'td'])
        print("read file")
        df = df.loc[df['number_sta'].isin([14066001, 14137001, 14216001, 14372001, 22092001, 22113006, 22135001])].fillna(0)
        print("split rows")
        df = df.compute()
        df.to_pickle(filename)
        print("to_pandas")

    # NOTE: the pickle is a cache written by this very function; do not point
    # `filename` at a file from an untrusted source.
    df = pd.read_pickle(filename)
    df.dropna(subset=['t'], inplace=True)

    # Select the station once; the original repeated this filter six times.
    station = df.loc[df['number_sta'] == 14066001]
    dates = station.loc[:, 'date'].values.astype(str).tolist()
    response = [
        {"index": dates, "values": station.loc[:, column].values.tolist()}
        for column in ('t', 'hu', 'td')
    ]
    print("response ready")
    response = orjson.dumps(response)
    return response

@app.route('/create-mts-windows', methods=['POST'])
def create_mts_windows():
    """Cut the station 14066001 series into sliding windows and cache them.

    Nothing is recomputed when ``processed-data.npy`` already exists.
    Otherwise the ``t``, ``hu`` and ``td`` columns are read from ``data.pkl``,
    sliced into windows of 120 samples with a stride of 1, each window is
    min-max scaled to ``(-1, 1)`` along ``axis=1``, and the result is saved
    with ``np.save('processed-data', ...)``.

    The request body is not read; the window size is fixed.

    Returns:
        str: always ``'1'``.
    """
    t0 = time()
    if not os.path.isfile('processed-data.npy'):
        df = pd.read_pickle('data.pkl')
        # Filter the station once instead of once per channel.
        station = df.loc[df['number_sta'] == 14066001]
        channels = [
            station.loc[:, column].fillna(0).values.tolist()
            for column in ('t', 'hu', 'td')
        ]
        print("Data read: " + str(time()-t0))
        window_size = 120
        print("Processing: " + str(time()-t0))
        # NOTE(review): the range stops at len - window_size, so the final
        # full-length window (starting exactly at len - window_size) is not
        # produced.  Kept as-is so existing window indices stay comparable.
        data = [
            [values[start:start + window_size] for values in channels]
            for start in range(0, len(channels[0]) - window_size, 1)
        ]
        print("Raw windows: " + str(time()-t0))
        windows = []
        for position, raw_window in enumerate(data):
            if position % 5000 == 0:
                print(position)
            windows.append(preprocessing.minmax_scale(raw_window, (-1, 1), axis=1))
        print("Preprocessed: " + str(time()-t0))
        np.save('processed-data', windows)
    print("Sending response: " + str(time()-t0))
    return '1'


117
118
@app.route('/create-windows', methods=['POST'])
def create_windows():
    """Chunk ``test.bigWig`` (chr1) into windows and cache three channels.

    Nothing is recomputed when ``processed-data.npy`` already exists.
    Otherwise the chunks returned by ``bigwig.chunk`` become channel 0, and
    two independently shuffled copies of the same chunks are appended as
    channels 1 and 2 before the array is saved with ``np.save``.

    The request body is not read; the window size is fixed.

    Returns:
        str: always ``'1'``.
    """
    t0 = time()
    if not os.path.isfile('processed-data.npy'):
        window_size = 120
        data = bigwig.chunk(
            'test.bigWig',
            12000,
            int(12000 / window_size),
            int(12000 / 6),
            ['chr1'],
            verbose=True,
        )
        # Insert a channel axis of length 1 between the window and sample axes.
        data = np.reshape(data, (len(data), 1, len(data[0])))

        # np.random.shuffle permutes along the first axis, so each copy holds
        # the same windows in a different order.
        data2 = np.copy(data)
        np.random.shuffle(data2)
        data3 = np.copy(data)
        np.random.shuffle(data3)

        data = np.concatenate((data, data2, data3), axis=1)
        np.save('processed-data', data)
    print('Windows created: ' + str(time()-t0))
    return '1'

@app.route('/create-test-windows', methods=['POST'])
def create_test_windows():
    """Build windows from the first five columns of ``21.csv`` and cache them.

    Nothing is recomputed when ``processed-data.npy`` already exists.
    Otherwise windows of 120 rows are taken every ``int(120 / 8)`` rows, each
    window is repeated three times along the first axis, and the array is
    reshaped and saved with ``np.save``.

    Returns:
        str: always ``'1'``.
    """
    t0 = time()
    if not os.path.isfile('processed-data.npy'):
        window_size = 120
        stride = int(window_size / 8)

        frame = pd.read_csv('21.csv', header=None)
        npdata = np.array(frame)
        print('data loaded')
        window_data = [
            npdata[start:start + window_size, 0:5]
            for start in range(0, npdata.shape[0] - window_size, stride)
        ]
        del npdata
        print('data created')
        np_window_data = np.repeat(window_data, repeats=3, axis=0)
        print(np_window_data.shape)
        del window_data
        # NOTE(review): np.reshape reinterprets the flat buffer; it does not
        # swap the row and column axes the way a transpose would.  Confirm
        # this is the layout the readers of processed-data.npy expect.
        data = np.reshape(np_window_data, (len(np_window_data), 5, len(np_window_data[0])))
        print(data.shape)
        np.save('processed-data', data)
    print('Windows created: ' + str(time()-t0))
    return '1'
167

168

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
169
170
171
@app.route('/initialize', methods=['POST'])
def initialize():
    """Estimate LSH parameters for the cached windows and run a first query.

    The JSON request body must contain ``"query"``.  The cached windows and
    the query are reshaped, ``preprocess`` derives ``(r, a, sd)``, and
    ``_lsh.lsh`` is run with them.  For every candidate index the distances
    from all its occurrences are summed, and candidates are ranked by that sum
    in ascending order.

    Returns:
        bytes: orjson-encoded object with the keys ``hash_functions``,
        ``candidates``, ``average_candidates``, ``average_distances``,
        ``distances`` and ``parameters``.
    """
    t0 = time()
    raw_data = orjson.loads(request.data)
    data = np.load('processed-data.npy')
    # NOTE(review): reshape (not transpose) is used to swap the last two
    # dimensions' sizes -- confirm this matches how the file was written.
    data = np.reshape(data, (len(data), len(data[0][0]), len(data[0])))
    query = raw_data["query"]
    query = np.reshape(query, (len(query[0]), len(query)))

    r, a, sd = preprocess(data)
    candidates, distances, hf = _lsh.lsh(data, query, r, a, sd)

    # Sum the distances per candidate over all three levels of the result.
    totals = defaultdict(int)
    for level_candidates, level_distances in zip(candidates, distances):
        for row_candidates, row_distances in zip(level_candidates, level_distances):
            for candidate, distance in zip(row_candidates, row_distances):
                totals[candidate] += distance

    # sorted() is stable, so ties keep first-seen order.
    ranked = sorted(totals.items(), key=lambda item: item[1])
    average_candidates = [candidate for candidate, _ in ranked]
    average_distances = [total for _, total in ranked]

    response = {
        "hash_functions": hf.tolist(),
        "candidates": candidates.tolist(),
        # Round-trip through numpy so the values become plain Python numbers.
        "average_candidates": np.array(average_candidates).tolist(),
        "average_distances": np.array(average_distances).tolist(),
        "distances": distances.tolist(),
        "parameters": [float(r), float(a), float(sd)]
    }
    response = orjson.dumps(response)
    print('LSH done: ' + str(time()-t0))
    return response

202
203
204
205
206
207
208
209
210
211
212
@app.route('/weights', methods=['POST'])
def weights():
    """Placeholder for the weight calculation, which is not implemented.

    The previous body returned the ``weights`` view function itself.  Flask
    treats a callable return value as a WSGI application and calls it with
    ``(environ, start_response)``, which this zero-argument function cannot
    accept, so every request failed with an internal error.  Until the
    calculation exists the endpoint says so explicitly.

    Returns:
        tuple: a short message and the HTTP status 501 (Not Implemented).
    """
    # TODO: calculate weights from the "labels" entry of the JSON payload.
    return 'Weight calculation is not implemented', 501


213
214
215
216
217
@app.route('/update', methods=['POST'])
def update():
    """Re-run the LSH query with parameters supplied by the client.

    The JSON request body must contain ``"query"`` and ``"parameters"``; the
    first three entries of ``"parameters"`` are passed to ``_lsh.lsh`` in
    order.  A ``"weights"`` entry may be present but is currently ignored
    (the original only bound it to an unused local).

    Returns:
        bytes: orjson-encoded object with the keys ``hash_functions``,
        ``distances`` and ``candidates``.
    """
    t0 = time()
    raw_data = orjson.loads(request.data)
    data = np.load('processed-data.npy')
    # NOTE(review): reshape (not transpose) is used to swap the last two
    # dimensions' sizes -- confirm this matches how the file was written.
    data = np.reshape(data, (len(data), len(data[0][0]), len(data[0])))
    query = raw_data["query"]
    query = np.reshape(query, (len(query[0]), len(query)))
    parameters = raw_data["parameters"]

    candidates, distances, hf = _lsh.lsh(data, query, parameters[0], parameters[1], parameters[2])

    response = {
        "hash_functions": hf.tolist(),
        "distances": distances.tolist(),
        "candidates": candidates.tolist()
    }
    response = orjson.dumps(response)
    print('LSH done: ' + str(time()-t0))
    return response

234
235
@app.route('/query', methods=['POST'])
def query():
    """Return one cached window, or the DBA result over several of them.

    The JSON request body must contain ``"window"``:

    * an integer selects that single window from ``processed-data.npy``;
    * otherwise it is treated as a mapping of index -> flag, and the windows
      whose flag ``is True`` are combined with ``performDBA``.

    Returns:
        bytes: orjson-encoded nested list holding the resulting window.
    """
    t0 = time()
    raw_data = orjson.loads(request.data)
    selection = raw_data['window']
    windows = np.load('processed-data.npy')

    if isinstance(selection, int):
        output = windows[selection]
    else:
        # JSON object keys arrive as strings, hence the int() conversion.
        chosen = [int(index) for index, flag in selection.items() if flag is True]
        output = performDBA(windows[chosen])

    response = orjson.dumps(output.tolist())
    print("Query done: " + str(time() - t0))
    return response

@app.route('/window', methods=['POST'])
def window():
    """Return the cached windows selected by the ``"indices"`` request entry.

    ``indices`` is used directly to index the array loaded from
    ``processed-data.npy``.

    Returns:
        bytes: orjson-encoded nested list of the selected window(s).
    """
    t0 = time()
    indices = orjson.loads(request.data)['indices']
    selected = np.load('processed-data.npy')[indices]
    response = orjson.dumps(selected.tolist())
    print("Query done: " + str(time() - t0))
    return response

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
262
263
@app.route('/table-info', methods=['POST'])
def table_info():
    """Summarise groups of windows and compare the group averages with DTW.

    The JSON request body must contain ``"windows"``: a list in which each
    entry is a list of indices into ``processed-data.npy``.  For every group
    the element-wise average and standard deviation over the group are
    computed; ``max`` / ``min`` are average plus / minus one standard
    deviation.  ``distances[i][j]`` is the DTW distance between the averages
    of group ``i`` and group ``j``.

    Returns:
        bytes: orjson-encoded ``{"prototypes": [...], "distances": [[...]]}``.
    """
    t0 = time()
    raw_data = orjson.loads(request.data)
    all_windows = raw_data['windows']
    data = np.load('processed-data.npy')

    prototypes = []
    averages = []
    for windows in all_windows:
        actual_windows = data[windows]
        average_values = np.average(actual_windows, 0)
        std_values = np.std(actual_windows, 0)
        # Keep a float array for the DTW step so the averages are not rebuilt
        # from their list form for every pair of groups.
        averages.append(np.asarray(average_values, dtype=float))
        prototypes.append({
            'average': average_values.tolist(),
            'max': (average_values + std_values).tolist(),
            'min': (average_values - std_values).tolist()
        })

    radius = int(0.05 * 120)
    distances = [
        [
            dtw(first, second, global_constraint='sakoe_chiba', sakoe_chiba_radius=radius)
            for second in averages
        ]
        for first in averages
    ]
    response = orjson.dumps({'prototypes': prototypes, 'distances': distances})
    print("Averages calculated: " + str(time() - t0))
    return response

285
def preprocess(data, r=1000):
    """Derive the three LSH parameters ``(r, a, sd)`` from a sample of windows.

    Steps:

    1. Greedily collect indices whose windows are pairwise at least ``r``
       apart in Euclidean norm.  If fewer than 10 were found by index 10000,
       ``r`` is halved and the scan restarts; as soon as more than 200 are
       found, ``r`` grows by half and the scan restarts.
    2. For every ordered pair of sampled windows, record the Euclidean
       distance and the Sakoe-Chiba constrained DTW distance.  Pairs whose
       Euclidean distance is NaN or 0 contribute 0.0001 to both lists.
    3. ``a`` and ``sd`` are the mean and standard deviation of the DTW /
       Euclidean ratios.  ``r`` is ``theta / ((a - sd) * sqrt(120))`` with
       ``theta = mean_dtw - 2.58 * sd_dtw``; a negative result is replaced by
       ``mean_dtw / 100``.

    Args:
        data: indexable collection of windows; ``data[i] - data[j]`` must be
            a valid NumPy operation.
        r: initial separation threshold for the sampling step.

    Returns:
        tuple: ``(r, a, sd)``.  NOTE(review): when fewer than two windows are
        sampled the statistics are taken over empty lists and come out NaN.
    """
    t0 = time()

    # --- 1. sample a spread-out subset of window indices -------------------
    subset = []
    i = 0
    while i < len(data):
        if i % 999 == 0:
            print(str(i) + ':' + str(len(subset)))

        # A comparison with NaN is False, so windows containing NaN are never
        # considered "close" and always join the subset.
        too_close = any(np.linalg.norm(data[i] - data[s]) < r for s in subset)
        if not too_close:
            subset.append(i)

        i = i + 1
        if i == 10000 and len(subset) < 10:
            # Threshold too large: almost everything was rejected.
            r = r / 2
            subset = []
            i = 0
        if len(subset) > 200:
            # Threshold too small: the sample is getting too big.
            r = r + r / 2
            subset = []
            i = 0

    # --- 2. pairwise Euclidean and DTW distances within the subset ---------
    dtw_distances = []
    eq_distances = []
    for i, index_1 in enumerate(subset):
        print(i)
        for index_2 in subset:
            if index_1 == index_2:
                continue
            e = np.linalg.norm(data[index_1] - data[index_2])
            if math.isnan(e) or e == 0:
                # Small placeholder keeps the ratio finite (0.0001 / 0.0001).
                eq_distances.append(0.0001)
                dtw_distances.append(0.0001)
                continue
            eq_distances.append(e)
            d = dtw(data[index_1], data[index_2], global_constraint='sakoe_chiba', sakoe_chiba_radius=int(0.05*120))
            dtw_distances.append(d)

    # --- 3. turn the distance statistics into parameters -------------------
    ratios = np.array(dtw_distances)/np.array(eq_distances)
    mean_dtw = np.mean(dtw_distances)
    sd_dtw = np.std(dtw_distances)
    a = np.mean(ratios)
    sd = np.std(ratios)
    theta = mean_dtw + -2.58 * sd_dtw
    # NOTE(review): 120 is hard-coded here and in the DTW radius above; it
    # matches the window size used by the window-creation routes -- confirm
    # before feeding windows of another length.
    r = theta / ((a-sd)*math.sqrt(120))
    if r < 0:
        r = mean_dtw / 100

    print('Mean: ' + str(mean_dtw))
    print('Stdev: ' + str(sd_dtw))
    print('Ratio mean: ' + str(a))
    print('Ratio stdev: ' + str(sd))
    print('Theta: ' + str(theta))
    print('r: ' + str(r))
    print('Preprocessing time: ' + str(time() - t0))
    return r, a, sd
359

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
360
def debug_test_lsh():
    """Debug helper: compare the LSH candidates with an exact DTW ranking.

    Loads ``processed-data.npy``, derives the LSH parameters with
    ``preprocess(data, 10)``, queries with window 1234 and prints how many of
    the 20 exact nearest windows (constrained DTW) appear among the first 200
    LSH candidates, ranked by summed distance.  Timings and both top-20 lists
    are printed along the way.  Nothing is returned.
    """
    data = np.load('processed-data.npy')
    print(data.shape)
    # NOTE(review): reshape (not transpose) is used to swap the last two
    # dimensions' sizes -- confirm this matches how the file was written.
    data = np.reshape(data, (len(data), len(data[0][0]), len(data[0])))

    r, a, sd = preprocess(data, 10)
    # The file was loaded above, so it exists and this call only prints its
    # timing line; it is kept to preserve the original output.
    create_windows()

    query_n = 1234
    t0 = time()
    # Taken before the dtype conversion below, exactly as in the original.
    query = data[query_n]
    data = data.astype('double')
    candidates, distances, hf = _lsh.lsh(data, query, r, a, sd)
    print("Calculated approximate in: " + str(time()-t0))

    # Sum the distances per candidate and rank candidates by that sum.
    totals = defaultdict(int)
    for level_candidates, level_distances in zip(candidates, distances):
        for row_candidates, row_distances in zip(level_candidates, level_distances):
            for candidate, distance in zip(row_candidates, row_distances):
                totals[candidate] += distance
    ranked = sorted(totals.items(), key=lambda item: item[1])
    candidates = [candidate for candidate, _ in ranked]
    print(candidates[0:20])

    t0 = time()
    distances = [dtw(window, query, global_constraint='sakoe_chiba', sakoe_chiba_radius=int(0.05*120)) for window in data]
    topk_dtw = sorted(range(len(distances)), key=lambda k: distances[k])
    print("Calculated exact dtw in: " + str(time()-t0))
    print(topk_dtw[0:20])

    # Build the shortlist once as a set instead of re-slicing and scanning
    # the list for every membership test.
    shortlist = set(candidates[0:200])
    accuracy = sum(1 for index in topk_dtw[0:20] if index in shortlist)
    print(accuracy)
399

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
400
401
# read_mts_data()
# create_mts_windows()
402
# debug_test_lsh()