main.py 11.6 KB
Newer Older
1
2
3
4
from flask import Flask, jsonify, request
import pandas as pd
import numpy as np
from flask_cors import CORS
5
from collections import defaultdict, Counter
6
from time import time
7
8
9
import os.path
import json
from sklearn import preprocessing
10
import orjson
11
import dask.dataframe as dd
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
12
13
import bigwig
import bbi
14
15
from bitarray import bitarray
import _ucrdtw
16
17
import _lsh
from scipy.spatial import distance
18
from scipy.sparse import dia_matrix
19
20
21
22
23
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
import dtw
import math
from random import sample
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
24
from DBA import performDBA
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
25

26
# Gate for the /create-windows handler: the window files are only re-extracted
# from the bigWig file and rewritten when this flag is truthy.
reload = False
27
28
29
30
31
32
33
34
35
36

# Flask application object on which the @app.route handlers are registered.
app = Flask(__name__)
# Wrap the application with flask_cors' CORS using its default options.
CORS(app)

@app.route('/', methods=['GET'])
def index():
    """Answer ``GET /`` with a fixed greeting string."""
    greeting = "hi"
    return greeting

@app.route('/read-data', methods=['GET'])
def read_data():
    """Return a binned overview of ``chr1`` from ``test.bigWig`` as JSON bytes.

    The encoded object has two keys:

    * ``"index"``  - positions obtained by stepping through ``[0, size)``
      with a step of ``size // bins`` (at least 1);
    * ``"values"`` - the result of ``bigwig.get`` over the whole chromosome,
      converted with ``tolist()``.

    Returns:
        bytes: the ``orjson``-encoded response.
    """
    t0 = time()
    size = bbi.chromsizes('test.bigWig')['chr1']
    bins = 100000
    data = bigwig.get('test.bigWig', 'chr1', 0, size, bins)
    print(data.shape)

    # Integer floor division avoids the float round-trip of int(size / bins).
    # The step is clamped to 1 because range() raises ValueError for a zero
    # step, which would happen whenever the chromosome is shorter than `bins`.
    step = max(1, int(size) // bins)
    # NOTE(review): because the step is floored, range() can yield more than
    # `bins` positions - confirm the client tolerates
    # len(index) != len(values).
    response = orjson.dumps({
        "index": list(range(0, size, step)),
        "values": data.tolist(),
    })
    print('Data read: ' + str(time() - t0))
    return response

@app.route('/create-windows', methods=['POST'])
def create_windows():
    """Rebuild the cached window files from ``test.bigWig`` when enabled.

    Nothing is extracted unless the module-level ``reload`` flag is truthy;
    in that case ``bigwig.chunk`` is run over ``chr1`` and three files are
    written to the working directory:

    * ``processed-data.npy`` - binary copy of the windows (``np.save``
      appends the ``.npy`` suffix);
    * ``processed-data``     - the same array as space-separated text;
    * ``query``              - row 80503 of the array as text.

    Returns:
        str: always ``'1'``.

    Raises:
        IndexError: if extraction yields fewer than 80504 windows, because
            row 80503 is written out unconditionally.
    """
    t0 = time()
    if reload:
        # TODO: the window size is hard-coded; it is not read from the
        # request body.
        window_size = 120
        region_size = 12000
        step_size = region_size // 6

        data = bigwig.chunk(
            'test.bigWig',
            region_size,
            region_size // window_size,
            step_size,
            ['chr1'],
            verbose=True,
        )
        print(data.shape)

        np.save('processed-data', data)
        np.savetxt('processed-data', data, delimiter=' ', fmt='%f')
        np.savetxt('query', data[80503], delimiter=' ', fmt='%f')
    print('Windows created: ' + str(time() - t0))
    return '1'
85

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
86
87
88
@app.route('/initialize', methods=['POST'])
def initialize():
    """Run a first LSH pass for the query posted by the client.

    Request body (JSON): an object with a ``"query"`` list.

    The cached windows in ``processed-data.npy`` are converted to doubles and
    given a trailing axis of length 1, the query is turned into a
    ``(len(query), 1)`` double array, and both are handed to ``_lsh.lsh``
    together with the parameters returned by ``preprocess()``.

    Returns:
        bytes: ``orjson``-encoded object with the keys ``"hash_functions"``,
        ``"candidates"``, ``"distances"`` and ``"parameters"``
        (``[r, a, sd]`` as floats).
    """
    t0 = time()

    # One conversion to a C-contiguous double array is enough; the reshape
    # that adds the trailing axis is then a view, so no further copy of the
    # whole dataset is made.
    data = np.ascontiguousarray(np.load('processed-data.npy'), dtype='double')
    data = data.reshape(len(data), len(data[0]), 1)

    raw_data = orjson.loads(request.data)
    # Coerce to double so the query has the same dtype as `data` even when
    # the JSON list contains only integers.
    query = np.asarray(raw_data["query"], dtype='double')
    query = query.reshape(len(query), 1)

    r, a, sd = preprocess()
    candidates, distances, hf = _lsh.lsh(data, query, r, a, sd)

    response = orjson.dumps({
        "hash_functions": hf.tolist(),
        "candidates": candidates.tolist(),
        "distances": distances.tolist(),
        "parameters": [float(r), float(a), float(sd)],
    })
    print('done: ' + str(time() - t0))
    return response

@app.route('/update', methods=['POST'])
def update():
    """Re-run LSH with client-supplied hash functions and parameters.

    Request body (JSON): an object with

    * ``"hash_functions"`` - values that are min-max normalised and reshaped
      to ``(window_length, 1)``;
    * ``"query"``          - list turned into a ``(len(query), 1)`` array;
    * ``"parameters"``     - sequence whose first three entries are passed
      to ``_lsh.lsh`` in the positions where ``initialize`` passes
      ``r``, ``a`` and ``sd``.

    Returns:
        bytes: ``orjson``-encoded object with the keys ``"hash_functions"``,
        ``"distances"`` and ``"candidates"``.
    """
    t0 = time()
    raw_data = orjson.loads(request.data)

    # Single conversion to a C-contiguous double array; adding the trailing
    # axis afterwards is a view rather than another copy of the dataset.
    data = np.ascontiguousarray(np.load('processed-data.npy'), dtype='double')
    data = data.reshape(len(data), len(data[0]), 1)

    hash_functions = np.array(raw_data["hash_functions"], dtype='double')
    lowest = np.min(hash_functions)
    span = np.ptp(hash_functions)
    if span == 0:
        # All values are equal: dividing by the zero range would fill the
        # array with NaN, so map the constant vector to zeros instead.
        hash_functions = np.zeros_like(hash_functions)
    else:
        hash_functions = (hash_functions - lowest) / span
    hash_functions = np.reshape(hash_functions, (len(data[0]), 1))

    # Coerce to double so the query has the same dtype as `data` even when
    # the JSON list contains only integers.
    query = np.asarray(raw_data["query"], dtype='double')
    query = query.reshape(len(query), 1)

    parameters = raw_data["parameters"]

    candidates, distances, hf = _lsh.lsh(
        data, query, parameters[0], parameters[1], parameters[2], hash_functions
    )
    response = orjson.dumps({
        "hash_functions": hf.tolist(),
        "distances": distances.tolist(),
        "candidates": candidates.tolist(),
    })
    print('done: ' + str(time() - t0))
    return response

138
139
@app.route('/query', methods=['POST'])
def query():
    """Build the query profile requested by the client.

    Request body (JSON): an object with a ``"window"`` entry that is either

    * an integer - that row of ``processed-data.npy`` is returned as the
      ``"average"`` with an empty ``"distances"`` list, or
    * a mapping of index -> flag - the rows whose flag is exactly ``True``
      are combined with ``performDBA``; ``"distances"`` is the absolute
      difference between the column sums of those rows and the average
      multiplied by the number of selected rows.

    Returns:
        bytes: ``orjson``-encoded object with ``"average"`` and
        ``"distances"``.
    """
    started = time()
    payload = orjson.loads(request.data)
    selection = payload['window']

    if not isinstance(selection, int):
        # Mapping form: keep only the entries explicitly flagged True.
        chosen = [int(key) for key, flag in selection.items() if flag is True]
        rows = np.load('processed-data.npy')[chosen]
        profile = performDBA(rows)
        column_sums = np.sum(rows, axis=0)
        spread = np.absolute(column_sums - profile * len(chosen))
        encoded = orjson.dumps({
            "average": profile.tolist(),
            "distances": spread.tolist()
        })
        print("Query done: " + str(time()-started))
        return encoded

    # Integer form: a single cached window is used as-is.
    row = np.load('processed-data.npy')[selection]
    encoded = orjson.dumps({
        "average": row.tolist(),
        "distances": []
    })
    print("Query done: " + str(time() - started))
    return encoded

@app.route('/window', methods=['POST'])
def window():
    """Return the cached windows at the requested row indices.

    Request body (JSON): an object with an ``"indices"`` entry used to index
    the array stored in ``processed-data.npy``.

    Returns:
        bytes: the selected rows, ``orjson``-encoded via ``tolist()``.
    """
    started = time()
    payload = orjson.loads(request.data)
    wanted = payload['indices']
    rows = np.load('processed-data.npy')[wanted]
    encoded = orjson.dumps(rows.tolist())
    print("Query done: " + str(time() - started))
    return encoded

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
178
179
@app.route('/average', methods=['POST'])
def average():
    """Summarise groups of windows and compare the group averages.

    Request body (JSON): an object with a ``"windows"`` list; every entry is
    used to index the array stored in ``processed-data.npy``.

    For each group the element-wise mean is reported as ``'average'`` and
    mean plus/minus one standard deviation as ``'max'``/``'min'``.  Every
    ordered pair of group averages is then compared with
    ``_ucrdtw.ucrdtw``.

    Returns:
        bytes: ``orjson``-encoded object with ``'averages'`` (list of the
        per-group summaries) and ``'distances'`` (square nested list).
    """
    started = time()
    payload = orjson.loads(request.data)
    groups = payload['windows']
    data = np.load('processed-data.npy')
    print("Initialized: " + str(time() - started))
    print(len(groups))

    averages = []
    for members in groups:
        selected = data[members]
        print(len(selected))
        centre = np.average(selected, 0)
        deviation = np.std(selected, 0)
        upper = centre + deviation
        lower = centre - deviation
        averages.append({
            'average': centre.tolist(),
            'max': upper.tolist(),
            'min': lower.tolist()
        })

    # Full pairwise matrix; fresh arrays are built for every call, exactly as
    # many times as there are (row, column) pairs.
    distances = []
    for left in averages:
        row = []
        for right in averages:
            outcome = _ucrdtw.ucrdtw(
                np.array(left["average"]),
                np.array(right["average"]),
                0.05 * 120,
                False,
            )
            row.append(outcome[1])
        distances.append(row)

    encoded = orjson.dumps({'averages': averages, 'distances': distances})
    print("Averages calculated: " + str(time() - started))
    return encoded

def preprocess(recompute=False):
    """Return the LSH parameters ``(r, a, sd)``.

    With the default ``recompute=False`` three stored constants are returned
    immediately and no file is read.  With ``recompute=True`` the parameters
    are estimated from ``processed-data.npy``:

    1. greedily collect window indices whose Euclidean distance to every
       already collected window is not below 3;
    2. for every ordered pair of distinct collected windows compute the
       Euclidean distance and the DTW distance (``dtw.dtw`` with a
       Sakoe-Chiba window of 120);
    3. ``a`` and ``sd`` are the mean and standard deviation of the
       DTW/Euclidean ratios, and
       ``r = (mean_dtw - 2.58 * sd_dtw) / ((a - sd) * sqrt(120))``.

    Args:
        recompute: when true, run the estimation instead of returning the
            stored constants.

    Returns:
        tuple: ``(r, a, sd)``.

    Raises:
        ValueError: if ``recompute`` is true and fewer than two windows were
            collected, so there is no pair to measure.
    """
    if not recompute:
        return 0.10882589134534404, 3.1202154563478928, 0.9705780396843037

    data = np.load('processed-data.npy')
    data = np.array(data, dtype='double')
    # Each window becomes a (1, window_length) array.
    data = np.reshape(data, (len(data), 1, len(data[0])))
    t0 = time()

    # Greedy selection: a window is kept only if no previously kept window
    # lies closer than `radius`.
    radius = 3
    subset = []
    for i, window in enumerate(data):
        if i % 10000 == 0:
            print(str(i) + ':' + str(len(subset)))
        too_close = any(
            np.linalg.norm(window - data[s]) < radius for s in subset
        )
        if not too_close:
            subset.append(i)

    dtw_distances = []
    eq_distances = []
    for i, index_1 in enumerate(subset):
        print(i)
        for index_2 in subset:
            if index_1 == index_2:
                continue
            eq_distances.append(np.linalg.norm(data[index_1] - data[index_2]))
            dtw_distances.append(
                dtw.dtw(
                    data[index_1],
                    data[index_2],
                    dist_method="Euclidean",
                    window_type="sakoechiba",
                    window_args={"window_size": 120},
                ).distance
            )

    if not dtw_distances:
        raise ValueError(
            "need at least two selected windows to estimate LSH parameters"
        )

    ratios = np.array(dtw_distances) / np.array(eq_distances)
    mean_dtw = np.mean(dtw_distances)
    sd_dtw = np.std(dtw_distances)
    a = np.mean(ratios)
    sd = np.std(ratios)
    theta = mean_dtw + -2.58 * sd_dtw
    # NOTE(review): 120 is hard-coded here; presumably the window length -
    # confirm before using windows of another size.
    r = theta / ((a - sd) * math.sqrt(120))
    print(mean_dtw)
    print(sd_dtw)
    print(a)
    print(sd)
    print(theta)
    print(r)
    print(time() - t0)
    return r, a, sd
268
269
270
271
272
273

def dtw_query():
    """Run ``_ucrdtw.ucrdtw`` over the cached windows and print the outcome.

    Loads ``processed-data.npy`` as doubles, takes row 80503 as the query,
    calls ``_ucrdtw.ucrdtw`` with the whole array, that query and ``0.05``,
    then prints the returned value followed by the elapsed time of the call.
    Nothing is returned.
    """
    windows = np.array(np.load('processed-data.npy'), dtype='double')
    reference = windows[80503]
    started = time()
    outcome = _ucrdtw.ucrdtw(windows, reference, 0.05)
    print(outcome)
    print(time() - started)

def lsh_method(r, a, sd):
    """Print LSH candidates next to a brute-force DTW ranking.

    Steps:

    1. call ``create_windows()``;
    2. load ``processed-data.npy`` once;
    3. build an LSH query with ``performDBA`` from rows 80503 and 11514 and
       run ``_lsh.lsh`` on the windows (as doubles with a trailing axis of
       length 1);
    4. print the first 20 candidates, the first 10 LSH distances and the
       positions of both source rows among the candidates;
    5. compute ``_ucrdtw.ucrdtw`` between row 80503 and every window and
       print the indices of the 10 smallest distances.

    Args:
        r: forwarded to ``_lsh.lsh``.
        a: forwarded to ``_lsh.lsh``.
        sd: forwarded to ``_lsh.lsh``.

    Returns:
        None. All results are printed.
    """
    create_windows()
    query_n = 80503
    second_n = 11514

    # Loaded once; the LSH input below is a separate double copy, so `raw`
    # stays available unchanged for the brute-force pass.
    raw = np.load('processed-data.npy')

    query = performDBA(raw[[query_n, second_n]])
    query = np.reshape(query, (len(raw[0]), 1))

    data = np.array(raw, dtype='double')
    data = np.reshape(data, (len(data), len(data[0]), 1))

    candidates, distances, _ = _lsh.lsh(data, query, r, a, sd)

    print(repr(candidates[0:20]))
    print(distances[0:10])
    print(np.where(candidates == query_n))
    print(np.where(candidates == second_n))

    # Brute-force reference ranking against the single window `query_n`.
    reference = raw[query_n]
    dtw_distances = [
        _ucrdtw.ucrdtw(window, reference, 0.05 * 120, False)[1]
        for window in raw
    ]
    topk_dtw = sorted(range(len(dtw_distances)), key=lambda k: dtw_distances[k])
    print(topk_dtw[0:10])
361
362

# r, a, sd = preprocess()
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
363
# lsh_method(r, a, sd)