main.py 11.3 KB
Newer Older
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
1
from flask import Flask, request
2
3
4
import numpy as np
from flask_cors import CORS
from time import time
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
5
import pandas as pd
6
import orjson
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
7
8
import bigwig
import bbi
9
import _ucrdtw
10
11
12
import _lsh
import dtw
import math
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
13
14
import dask.dataframe as dd
import os.path
15
from random import sample
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
16
17
18
from DBA_multivariate import performDBA
from tslearn.metrics import dtw
from sklearn import preprocessing
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
19

20
reload = False
21
22
23
24
25
26
27
28
29
30

app = Flask(__name__)
CORS(app)

@app.route('/', methods=['GET'])
def index():
    return "hi"

@app.route('/read-data', methods=['GET'])
def read_data():
31
    t0 = time()
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
32
33
34
35
    size = bbi.chromsizes('test.bigWig')['chr1']
    bins = 100000
    data = bigwig.get('test.bigWig', 'chr1', 0, size, bins)
    print(data.shape)
36
    response = {
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
37
38
        "index": list(range(0, size, int(size/(bins)))),
        "values": data.tolist()
39
    }
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
40
    response = orjson.dumps(response)
41
    print('Data read: ' + str(time()-t0))
42
43
    return response

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
@app.route('/read-mts-data', methods=['GET'])
def read_mts_data():
    filename = 'data.pkl'
    if (not os.path.isfile(filename)):
        print("start")
        df = dd.read_csv("NW_Ground_Stations_2016.csv", usecols=['number_sta', 'date', 't', 'hu', 'td'])
        print("read file")
        df = df.loc[df['number_sta'] == 14066001]
        print("split rows")
        df = df.compute()
        df.to_pickle(filename)
        print("to_pandas")
    df = pd.read_pickle(filename)
    df.dropna(subset=['t'], inplace=True)
    response = [
        {
            "index": df.loc[:, 'date'].values.astype(str).tolist(),
            "values": df.loc[:, 't'].values.tolist()
        },
        {
            "index": df.loc[:, 'date'].values.astype(str).tolist(),
            "values": df.loc[:, 'hu'].values.tolist()
        },
        {
            "index": df.loc[:, 'date'].values.astype(str).tolist(),
            "values": df.loc[:, 'td'].values.tolist()
        }
    ]
    print("response ready")
    response = orjson.dumps(response)
    return response

@app.route('/create-mts-windows', methods=['POST'])
def create_mts_windows():
    t0 = time()
    if (not os.path.isfile('processed-data.npy')):
        filename = 'data.pkl'
        df = pd.read_pickle(filename)
        channels = list()
        channels.append(df.loc[:, 't'].fillna(0).values.tolist())
        channels.append(df.loc[:, 'hu'].fillna(0).values.tolist())
        channels.append(df.loc[:, 'td'].fillna(0).values.tolist())
        print(np.isnan(df.loc[:, 't'].fillna(0).values.tolist()).any())
        print(np.isnan(df.loc[:, 'hu'].fillna(0).values.tolist()).any())
        print(np.isnan(df.loc[:, 'td'].fillna(0).values.tolist()).any())
        print("Data read: " + str(time()-t0))
        # raw_data = request.json
        window_size = 120 #int(raw_data['parameters']["windowsize"])
        print("Processing: " + str(time()-t0))
        data = [([values[i:i+window_size] for values in channels]) for i in range(0, len(channels[0]) - window_size, 5)]
        print("Raw windows: " + str(time()-t0))
        windows = []
        for i in range(len(data)):
            if i % 5000 == 0:
                print(i)
            windows.append(preprocessing.minmax_scale(data[i], (-1, 1), axis=1))
        print("Preprocessed: " + str(time()-t0))
        np.save('processed-data', windows)
    print("Sending response: " + str(time()-t0))
    return '1'


106
107
@app.route('/create-windows', methods=['POST'])
def create_windows():
108
    t0 = time()
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
109
    if reload:
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
110
111
        raw_data = request.json
        window_size = int(raw_data['parameters']["windowsize"])
112
        chromsize = bbi.chromsizes('test.bigWig')['chr1']
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
        step_size = int(12000 / 6)
        start_bps = np.arange(0, chromsize - 12000 + step_size, step_size)
        end_bps = np.arange(12000, chromsize + step_size, step_size)
        data = bigwig.chunk(
            'test.bigWig',
            12000,
            int(12000 / window_size),
            int(12000 / 6),
            ['chr1'],
            verbose=True,
        )
        # data = bbi.stackup(
        #     'test.bigWig',
        #     ['chr1'] * start_bps.size,
        #     start_bps,
        #     end_bps,
        #     bins=window_size,
        #     missing=0.0,
        #     oob=0.0,
        # )
        # data = (data - np.min(data))/np.ptp(data)
134
        np.save('processed-data', data)
135
        np.savetxt('processed-data', data, delimiter=' ', fmt='%f')
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
136
        np.savetxt('query', data[80503], delimiter=' ', fmt='%f')
137
    print('Windows created: ' + str(time()-t0))
138
    return '1'
139

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
140
141
142
@app.route('/initialize', methods=['POST'])
def initialize():
    t0 = time()
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
143
    raw_data = orjson.loads(request.data)
144
    data = np.load('processed-data.npy')
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
145
    data = np.reshape(data, (len(data), len(data[0][0]), len(data[0])))
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
146
    query = raw_data["query"]
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
147
    query = np.reshape(query, (len(query[0]), len(query)))
148

149
    r, a, sd = preprocess(data)
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
150
    candidates, distances, hf = _lsh.lsh(data, query, r, a, sd)
151

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
152
153
154
155
    response = {
        "hash_functions": hf.tolist(),
        "candidates": candidates.tolist(),
        "distances": distances.tolist(),
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
156
        "parameters": [float(r), float(a), float(sd)]
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
157
158
    }
    response = orjson.dumps(response)
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
159
    print('LSH done: ' + str(time()-t0))
160
161
    return response

162
163
164
165
166
167
168
169
170
171
172
@app.route('/weights', methods=['POST'])
def weights():
    raw_data = orjson.loads(request.data)
    parameters = raw_data["labels"]

    # Caculate weights

    response = weights
    return response


173
174
175
176
177
@app.route('/update', methods=['POST'])
def update():
    t0 = time()
    raw_data = orjson.loads(request.data)
    data = np.load('processed-data.npy')
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
178
    data = np.reshape(data, (len(data), len(data[0][0]), len(data[0])))
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
179
    query = raw_data["query"]
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
180
181
    query = np.reshape(query, (len(query[0]), len(query)))
    weights = raw_data["weights"]
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
182
183
    parameters = raw_data["parameters"]

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
184
    candidates, distances, hf = _lsh.lsh(data, query, parameters[0], parameters[1], parameters[2])
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
185
186
187
188
189
190
    response = {
        "hash_functions": hf.tolist(),
        "distances": distances.tolist(),
        "candidates": candidates.tolist()
    }
    response = orjson.dumps(response)
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
191
    print('LSH done: ' + str(time()-t0))
192
193
    return response

194
195
@app.route('/query', methods=['POST'])
def query():
196
    t0 = time()
197
    raw_data = orjson.loads(request.data)
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
198
199
200
201
    windowIndices = raw_data['window']
    if isinstance(windowIndices, int):
        output = np.load('processed-data.npy')[windowIndices]
        response = orjson.dumps(output.tolist())
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
202
203
        print("Query done: " + str(time() - t0))
        return response
204
    else:
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
205
        indices = [int(index) for index, value in windowIndices.items() if value is True]
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
206
        data = np.load('processed-data.npy')[indices]
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
207
208
        output = performDBA(data)
        response = orjson.dumps(output.tolist())
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
209
210
211
212
213
214
215
216
217
        print("Query done: " + str(time()-t0))
        return response

@app.route('/window', methods=['POST'])
def window():
    t0 = time()
    raw_data = orjson.loads(request.data)
    indices = raw_data['indices']
    output = np.load('processed-data.npy')[indices]
218
    response = orjson.dumps(output.tolist())
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
219
    print("Query done: " + str(time() - t0))
220
221
    return response

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
222
223
@app.route('/table-info', methods=['POST'])
def table_info():
224
225
226
227
    t0 = time()
    raw_data = orjson.loads(request.data)
    all_windows = raw_data['windows']
    data = np.load('processed-data.npy')
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
228
    prototypes = []
229
    for windows in all_windows:
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
230
        actual_windows = data[windows]
231
232
233
234
        average_values = np.average(actual_windows, 0)
        std_values = np.std(actual_windows, 0)
        max_values = average_values + std_values
        min_values = average_values - std_values
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
235
        prototypes.append({
236
237
238
            'average': average_values.tolist(),
            'max': max_values.tolist(),
            'min': min_values.tolist()
239
        })
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
240
    distances = [[dtw(np.array(v["average"]), np.array(w["average"]), global_constraint='sakoe_chiba', sakoe_chiba_radius=int(0.05 * 120)) for j, w in enumerate(prototypes)] for i, v in enumerate(prototypes)]
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
241
    response = orjson.dumps({'prototypes': prototypes, 'distances': distances})
242
    print("Averages calculated: " + str(time() - t0))
243
244
    return response

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
245
def preprocess(data, r=10):
246
247
    # return 0.10882589134534404, 3.1202154563478928, 0.9705780396843037
    # data = np.load('processed-data.npy')
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
248
    data = np.array(data, dtype='double')
249
250
    # data = np.reshape(data, (int(len(data) / 1), 1, len(data[0])))
    # data = np.repeat(data, repeats=1, axis=1)
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
251
252
253
    subset = []
    t0 = time()

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
254
255
256
257
258
259
260
261
262
263
    # for i, window in enumerate(data):
    #     if i % 10000 == 0:
    #         print(str(i) + ':' + str(len(subset)))
    #     state = 1
    #     for s in subset:
    #         if np.linalg.norm(window - data[s]) < r:
    #             state = 0
    #             break
    #     if state == 1:
    #         subset.append(i)
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
264

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
265
    subset = sample(list(range(len(data))), 200)
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
266

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
267
268
269
270
271
272
273
274
    dtw_distances = []
    eq_distances = []
    for i, index_1 in enumerate(subset):
        print(i)
        for j, index_2 in enumerate(subset):
            if index_1 == index_2:
                continue
            e = np.linalg.norm(data[index_1] - data[index_2])
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
275
276
            if (math.isnan(e)):
                continue
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
277
            eq_distances.append(e)
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
278
279
            d = dtw(data[index_1], data[index_2], global_constraint='sakoe_chiba', sakoe_chiba_radius=int(0.05*120))
            # d = _ucrdtw.ucrdtw(data[index_1], data[index_2], 0.05, False)[1]
280
            # d = dtw.dtw(data[index_1], data[index_2], dist_method="Euclidean", window_type="sakoechiba", window_args={"window_size": 120}).distance
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
281
            dtw_distances.append(d)
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
282

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
283
284
285
286
287
288
289
290
291
292
293
    ratios = np.array(dtw_distances)/np.array(eq_distances)
    mean_dtw = np.mean(dtw_distances)
    sd_dtw = np.std(dtw_distances)
    mean_eq = np.mean(eq_distances)
    sd_eq = np.std(eq_distances)
    a = np.mean(ratios)
    sd = np.std(ratios)
    theta = mean_dtw + -2.58 * sd_dtw
    # theta = mean_eq + -2.58 * sd_eq
    r = theta / ((a-sd)*math.sqrt(120))
    # r = theta / (math.sqrt(120))
294
295
296
297
298
299
    print('Mean: ' + str(mean_dtw))
    print('Stdev: ' + str(sd_dtw))
    print('Ratio mean: ' + str(a))
    print('Ratio stdev: ' + str(sd))
    print('Theta: ' + str(theta))
    print('r: ' + str(r))
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
300
    print('Preprocessing time: ' + str(time() - t0))
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
301
    return r, a, sd
302

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
303
def debug_test_lsh():
304
    data = np.load('processed-data.npy')
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
305
306
307
308
309
310
311
312
313
    print(data.shape)
    data = np.reshape(data, (len(data), 120, 3))
    # data2 = np.copy(data)
    # for i in range(5):
    #     np.random.shuffle(data2)
    #     data = np.concatenate((data, data2), axis=2)
    # print(data.shape)
    # data = np.repeat(data, repeats=10, axis=2)
    r, a, sd = preprocess(data, 10)
314
    create_windows()
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
315
    query_n = 1234
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
316

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
317
318
    t0 = time()
    query = data[query_n]
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
319
    candidates, distances, hf = _lsh.lsh(data, query, r, a, sd)
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
320
321
    print("Calculated approximate in: " + str(time()-t0))
    print(candidates[0:20])
322

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
323
324
    t0 = time()
    distances = [dtw(window, query, global_constraint='sakoe_chiba', sakoe_chiba_radius=int(0.05*120)) for window in data]
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
325
    topk_dtw = sorted(range(len(distances)), key=lambda k: distances[k])
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
326
    print("Calculated exact dtw in: " + str(time()-t0))
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
327
    print(topk_dtw[0:10])
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
328

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
329
330
331
    # # distances_ed = [distance.euclidean(query, window) for window in data]
    # # topk_ed = sorted(range(len(distances_ed)), key=lambda k: distances_ed[k])
    #
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
332
    accuracy = 0
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
333
334
    for index in topk_dtw[0:50]:
        if index in candidates[0:50]:
Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
335
336
            accuracy += 1
    print(accuracy)
337

Kruyff,D.L.W. (Dylan)'s avatar
Kruyff,D.L.W. (Dylan) committed
338
339
# read_mts_data()
# create_mts_windows()
340
# debug_test_lsh()