import os.path
import json
import math
from collections import defaultdict, Counter
from random import sample
from time import time

from flask import Flask, jsonify, request
from flask_cors import CORS
import pandas as pd
import numpy as np
from sklearn import preprocessing
import orjson
import dask.dataframe as dd
import bbi
from bitarray import bitarray
from scipy.spatial import distance
from scipy.spatial.distance import euclidean
from scipy.sparse import dia_matrix
from fastdtw import fastdtw
import dtw

# Presumably project-local modules / compiled extensions -- confirm.
import bigwig
import _ucrdtw
import _lsh
from DBA import performDBA

# Switch read by create_windows(): the cached window files are only
# regenerated while this is true; otherwise that endpoint does no work.
reload = False

# Flask application object; the route decorators below register on it.
app = Flask(__name__)
CORS(app)

@app.route('/', methods=['GET'])
def index():
    """Root endpoint: answer with a fixed greeting string."""
    greeting = "hi"
    return greeting

@app.route('/read-data', methods=['GET'])
def read_data():
    """Return a binned overview of chr1 from ``test.bigWig``.

    The chromosome length is looked up with ``bbi.chromsizes`` and the
    whole chromosome is fetched through ``bigwig.get`` with a fixed number
    of bins.

    Returns:
        orjson-encoded bytes of an object with two keys:
        ``index`` (evenly spaced positions starting at 0) and ``values``
        (the array returned by ``bigwig.get``, as a list). Both lists have
        the same length.
    """
    t0 = time()
    size = bbi.chromsizes('test.bigWig')['chr1']
    bins = 100000
    data = bigwig.get('test.bigWig', 'chr1', 0, size, bins)
    print(data.shape)
    values = data.tolist()

    # Integer step; clamped to 1 so a chromosome shorter than `bins` cannot
    # produce range(..., step=0), which raises ValueError.
    step = max(1, int(size) // bins)
    # range(0, size, step) yields more entries than there are values
    # whenever size is not an exact multiple of bins; trim it so that every
    # index has a matching value.
    index = list(range(0, int(size), step)[:len(values)])

    response = orjson.dumps({
        "index": index,
        "values": values
    })
    print('Data read: ' + str(time()-t0))
    return response

@app.route('/create-windows', methods=['POST'])
def create_windows():
    """Regenerate the cached window files from ``test.bigWig``.

    Work is only done while the module-level ``reload`` flag is true;
    otherwise just the timing line is printed. When enabled, chr1 is cut
    into windows by ``bigwig.chunk`` and the result is written to:

    * ``processed-data.npy`` -- binary array (read back by the other
      endpoints via ``np.load('processed-data.npy')``),
    * ``processed-data``     -- the same array as space-separated text,
    * ``query``              -- row 80503 of the array as text.

    Returns:
        The string ``'1'``.

    Raises:
        IndexError: if the chunked data has fewer than 80504 rows.
    """
    t0 = time()
    if reload:
        # TODO: read the window size from the request again, e.g.
        #   window_size = int(request.json['parameters']['windowsize'])
        window_size = 120
        # Positional arguments of bigwig.chunk; presumably the span of one
        # window in bp, the bp covered by one bin, and the stride between
        # consecutive windows -- confirm against bigwig.chunk.
        span = 12000
        per_bin = span // window_size
        stride = span // 6
        data = bigwig.chunk(
            'test.bigWig',
            span,
            per_bin,
            stride,
            ['chr1'],
            verbose=True,
        )
        print(data.shape)
        np.save('processed-data', data)
        np.savetxt('processed-data', data, delimiter=' ', fmt='%f')
        # Row 80503 is the fixed example query used by the test helpers.
        np.savetxt('query', data[80503], delimiter=' ', fmt='%f')
    print('Windows created: ' + str(time()-t0))
    return '1'


@app.route('/initialize', methods=['POST'])
def initialize():
    """Run a first LSH pass for the query posted by the client.

    Request body (JSON): an object whose ``query`` entry is a sequence of
    values. Both the cached windows and the query get a trailing axis on
    which every value is repeated five times before they are handed to
    ``_lsh.lsh`` together with the parameters from ``preprocess()``.

    Returns:
        orjson-encoded bytes with the keys ``hash_functions``,
        ``candidates``, ``distances`` and ``parameters`` (``[r, a, sd]``).
    """
    started = time()

    windows = np.array(np.load('processed-data.npy'), dtype='double')
    n_windows, window_len = len(windows), len(windows[0])
    windows = np.repeat(
        windows.reshape((n_windows, window_len, 1)), repeats=5, axis=2)

    payload = orjson.loads(request.data)
    posted = payload["query"]
    query = np.repeat(
        np.reshape(posted, (len(posted), 1)), repeats=5, axis=1)

    r, a, sd = preprocess()
    candidates, distances, hf = _lsh.lsh(windows, query, r, a, sd)

    body = orjson.dumps({
        "hash_functions": hf.tolist(),
        "candidates": candidates.tolist(),
        "distances": distances.tolist(),
        "parameters": [r, a, sd]
    })
    print('done: ' + str(time()-started))
    return body

@app.route('/update', methods=['POST'])
def update():
    """Re-run LSH with hash functions and parameters sent by the client.

    Request body (JSON): ``hash_functions``, ``query`` and ``parameters``
    (a sequence whose first three items are passed on to ``_lsh.lsh``).
    The hash functions are min-max scaled and reshaped into a column whose
    length equals the window length. Windows and query are expanded
    five-fold along a new trailing axis.

    Returns:
        orjson-encoded bytes with the keys ``hash_functions``,
        ``distances`` and ``candidates``.
    """
    started = time()
    payload = orjson.loads(request.data)

    windows = np.array(np.load('processed-data.npy'), dtype='double')
    window_len = len(windows[0])
    windows = np.repeat(
        windows.reshape((len(windows), window_len, 1)), repeats=5, axis=2)

    weights = np.array(payload["hash_functions"], dtype='double')
    # Min-max scaling of the client-supplied hash functions.
    weights = (weights - np.min(weights)) / np.ptp(weights)
    weights = np.reshape(weights, (window_len, 1))

    posted = payload["query"]
    query = np.repeat(
        np.reshape(posted, (len(posted), 1)), repeats=5, axis=1)

    params = payload["parameters"]
    candidates, distances, hf = _lsh.lsh(
        windows, query, params[0], params[1], params[2], weights)

    body = orjson.dumps({
        "hash_functions": hf.tolist(),
        "distances": distances.tolist(),
        "candidates": candidates.tolist()
    })
    print('done: ' + str(time()-started))
    return body


@app.route('/query', methods=['POST'])
def query():
    """Build a query series from one window or from a selection of windows.

    Request body (JSON): ``window`` is either an integer row index or a
    mapping from row index (as key) to a flag; only entries whose flag is
    exactly ``True`` are used.

    Returns:
        orjson-encoded bytes with ``average`` and ``distances``. For a
        single index, ``average`` is that row and ``distances`` is empty.
        For a selection, ``average`` is the result of ``performDBA`` on
        the chosen rows and ``distances`` is the absolute difference
        between the column-wise sum of the rows and ``average`` times the
        number of chosen rows.
    """
    started = time()
    payload = orjson.loads(request.data)
    selection = payload['window']

    if not isinstance(selection, int):
        chosen = [int(key) for key, flag in selection.items() if flag is True]
        rows = np.load('processed-data.npy')[chosen]
        average = performDBA(rows)
        spread = np.absolute(np.sum(rows, axis=0) - average * len(chosen))
        body = orjson.dumps({
            "average": average.tolist(),
            "distances": spread.tolist()
        })
        print("Query done: " + str(time()-started))
        return body

    row = np.load('processed-data.npy')[selection]
    body = orjson.dumps({
        "average": row.tolist(),
        "distances": []
    })
    print("Query done: " + str(time() - started))
    return body

@app.route('/window', methods=['POST'])
def window():
    """Return the cached windows selected by the posted ``indices``.

    Request body (JSON): an object with an ``indices`` entry that is used
    directly to index the array loaded from ``processed-data.npy``.

    Returns:
        orjson-encoded bytes of the selected rows as nested lists.
    """
    started = time()
    wanted = orjson.loads(request.data)['indices']
    rows = np.load('processed-data.npy')[wanted]
    body = orjson.dumps(rows.tolist())
    print("Query done: " + str(time() - started))
    return body

@app.route('/average-progress', methods=['POST'])
def average_progress():
    """Summarise the windows accumulated over successive index batches.

    Request body (JSON): ``windows`` is a list of index lists. The batches
    are accumulated, so every summary covers all rows seen so far, not
    just the current batch. For each batch the column-wise average,
    maximum and minimum of the accumulated rows are computed.

    Returns:
        orjson-encoded bytes of a list with the newest summary first. A
        batch processed while nothing has been accumulated yet contributes
        an empty list, which is appended at the end.
    """
    started = time()
    payload = orjson.loads(request.data)
    batches = payload['windows']
    table = np.load('processed-data.npy')
    results = []
    seen = []
    print("Starting average progress")
    print("Initialized: " + str(time() - started))
    for batch in batches:
        batch_started = time()
        seen.extend(table[batch])
        if not seen:
            results.append([])
            continue
        summary = {
            'average': (np.sum(seen, 0)/len(seen)).tolist(),
            'max': np.maximum.reduce(seen).tolist(),
            'min': np.minimum.reduce(seen).tolist()
        }
        # Newest summary goes to the front of the list.
        results.insert(0, summary)
        print("Average calculated: " + str(time() - batch_started))
    body = orjson.dumps(results)
    print("Averages calculated: " + str(time() - started))
    return body


@app.route('/average-table', methods=['POST'])
def average_table():
    """Compute a mean +/- one standard deviation band per index batch.

    Request body (JSON): ``windows`` is a list of index lists. Unlike
    ``average_progress``, every batch is summarised on its own.

    Returns:
        orjson-encoded bytes of a list with one object per batch, in
        request order: ``average`` (column-wise mean), ``max`` (mean plus
        standard deviation) and ``min`` (mean minus standard deviation).
    """
    started = time()
    payload = orjson.loads(request.data)
    batches = payload['windows']
    table = np.load('processed-data.npy')
    results = []
    print("Initialized: " + str(time() - started))
    for batch in batches:
        batch_started = time()
        rows = table[batch]
        print(len(rows))
        mean = np.average(rows, 0)
        sigma = np.std(rows, 0)
        results.append({
            'average': mean.tolist(),
            'max': (mean + sigma).tolist(),
            'min': (mean - sigma).tolist()
        })
        print("Average calculated: " + str(time() - batch_started))
    body = orjson.dumps(results)
    print("Averages calculated: " + str(time() - started))
    return body

def preprocess():
    """Return the fixed LSH parameters ``(r, a, sd)``.

    The values are hard-coded. They presumably come from the calibration
    routine that used to live here as commented-out code (confirm before
    changing them). That routine, in short:

    * picked a subset of windows greedily, keeping a window only if its
      Euclidean distance to every already kept window was at least 3;
    * computed, for every ordered pair in the subset, the Euclidean
      distance and the DTW distance (Sakoe-Chiba band, window size 6);
    * set ``a`` and ``sd`` to the mean and standard deviation of the
      DTW/Euclidean distance ratios;
    * set ``r = (mean_dtw - 2.58 * sd_dtw) / sqrt(120)``.

    Returns:
        tuple[float, float, float]: ``(r, a, sd)``.
    """
    r = 0.7044726353514034
    a = 6.560676514065376
    sd = 1.1752680457916154
    return r, a, sd

def dtw_query():
    """Time one ``_ucrdtw.ucrdtw`` call of row 80503 against all windows.

    Loads ``processed-data.npy`` as doubles, uses row 80503 as the query,
    then prints the value returned by ``_ucrdtw.ucrdtw`` followed by the
    elapsed time of that call in seconds.
    """
    windows = np.array(np.load('processed-data.npy'), dtype='double')
    reference = windows[80503]
    started = time()
    result = _ucrdtw.ucrdtw(windows, reference, 0.05)
    print(result)
    print(time() - started)

def lsh_method(r, a, sd):
    """Print how well the LSH candidates cover the best DTW matches.

    Row 80503 of ``processed-data.npy`` is used as the query. The LSH
    candidates come from ``_lsh.lsh``. The reference ranking is obtained by
    computing ``_ucrdtw.ucrdtw`` between the query and every window and
    sorting the windows by that distance.

    Printed, in order: the first 10 candidates, the first 10 entries of
    the DTW ranking, and then one count per cut-off (50, 1000, 5000,
    10000, 50000, all) of how many of the 50 best DTW matches occur among
    the first ``cut-off`` candidates.

    Args:
        r, a, sd: parameters forwarded unchanged to ``_lsh.lsh`` (see
            ``preprocess``).
    """
    create_windows()
    query_n = 80503
    raw = np.load('processed-data.npy')

    # LSH works on a double array with an explicit trailing axis.
    data = np.array(raw, dtype='double')
    data = np.reshape(data, (len(data), len(data[0]), 1))
    candidates, distances, hf = _lsh.lsh(data, data[query_n], r, a, sd)
    print(repr(candidates[0:10]))

    # Exhaustive DTW ranking on the array exactly as it was loaded.
    query = raw[query_n]
    distances = [_ucrdtw.ucrdtw(window, query, 0.05, False)[1] for window in raw]
    topk_dtw = sorted(range(len(distances)), key=lambda k: distances[k])
    print(topk_dtw[0:10])

    best = topk_dtw[0:50]
    # None as the last cut-off selects every candidate.
    for cutoff in (50, 1000, 5000, 10000, 50000, None):
        pool = candidates[0:cutoff]
        accuracy = sum(1 for index in best if index in pool)
        print(accuracy)


# create_windows()
# dtw_query()
# NOTE(review): the statements below sit at module level, so they run on
# every import of this file -- including when the Flask app is loaded --
# and execute the full lsh_method() comparison each time. Consider an
# `if __name__ == "__main__":` guard if that is not intended.
data = np.load('processed-data.npy')
data = np.reshape(data, (len(data), len(data[0]), 1))
# NOTE(review): this rebinds the module-level name `query`, which up to
# here referred to the /query view function. The route was registered by
# the decorator before this line runs; confirm nothing else relies on the
# module attribute. `data` and `query` are not passed to lsh_method(),
# which loads the file itself.
query = data[80503]
r, a, sd = preprocess()
lsh_method(r, a, sd)

# r, a, sd = preprocess()
# candidates, hf = _lsh.lsh(data, query, r, a, sd)
# ---- earlier, disabled experiments (left commented out) ----
# r, a, sd = preprocess()
# lsh_method(r, a, sd)
# create_windows()
# query_n = 80503
# data = np.load('processed-data.npy')
# data= np.array(data, dtype='double')
# data = np.reshape(data, (len(data), len(data[0]), 1))
# data = np.repeat(data, repeats=10, axis=2)
# query = data[query_n]
# # candidates, hf = _lsh.lsh(data, query)
# # data = np.load('processed-data.npy')
# # query = data[query_n]
#
# data = np.load('processed-data.npy')
# print(_ucrdtw.ucrdtw(data[query_n], data[0], 0.05, False)[1])
#
# # l2_norm = lambda x, y: (x - y) ** 2
#
# data = np.load('processed-data.npy')
# data= np.array(data, dtype='double')
# data = np.repeat(data, repeats=1, axis=0)
# data = np.reshape(data, (int(len(data)/1), 1, len(data[0])))
# query = data[query_n]
# # distances = [_ucrdtw.ucrdtw(window, query, 0.05, False)[1] for window in data]
# # topk_dtw = sorted(range(len(distances)), key=lambda k: distances[k])
# # print(topk_dtw[0:10])
#
# # Generate our data
# template = data[query_n]
# rt,ct = template.shape
# rq,cq = query.shape
# t0 = time()
# # Calculate the alignment vector and corresponding distance
# alignment = dtw.dtw(query, template, keep_internals=True)
# print(alignment.distance)
#
# print(time()-t0)
# np.save('topk', np.array(topk_dtw))
# Printed at import time, once the module-level statements above finished.
print('done')
# topk_dtw = np.load('topk.npy')
# distances_ed = [distance.euclidean(query, window) for window in data]
# topk_ed = sorted(range(len(distances_ed)), key=lambda k: distances_ed[k])

#
#
# accuracy = 0
# for index in topk_dtw[0:50]:
#     if index in candidates[0:50]:
#         accuracy += 1
# print(accuracy)
# accuracy = 0
# output = []
# for index in topk_ed[0:50]:
#     if index in candidates:
#         accuracy += 1
# print(accuracy)
# accuracy = 0
# for index in topk_ed[0:50]:
#     if index in candidates[0:50]:
#         accuracy += 1
# print(accuracy)
# accuracy = 0
# for index in topk_ed[0:20]:
#     if index in candidates[0:20]:
#         accuracy += 1
# print(accuracy)