from flask import Flask, request
import numpy as np
from flask_cors import CORS
from time import time
import pandas as pd
import orjson
import bigwig
import bbi
import _ucrdtw  # used only by a commented-out DTW alternative in preprocess()
import _lsh
import math
import dask.dataframe as dd
import os.path
from random import sample
from DBA_multivariate import performDBA
from tslearn.metrics import dtw
from sklearn import preprocessing

reload = False

app = Flask(__name__)
CORS(app)


@app.route('/', methods=['GET'])
def index():
    return "hi"


@app.route('/read-data', methods=['GET'])
def read_data():
    t0 = time()
    size = bbi.chromsizes('test.bigWig')['chr1']
    bins = 100000
    data = bigwig.get('test.bigWig', 'chr1', 0, size, bins)
    print(data.shape)
    response = {
        "index": list(range(0, size, int(size / bins))),
        "values": data.tolist()
    }
    response = orjson.dumps(response)
    print('Data read: ' + str(time() - t0))
    return response


@app.route('/read-mts-data', methods=['GET'])
def read_mts_data():
    filename = 'data.pkl'
    if not os.path.isfile(filename):
        print("start")
        df = dd.read_csv("NW_Ground_Stations_2016.csv",
                         usecols=['number_sta', 'date', 't', 'hu', 'td'])
        print("read file")
        df = df.loc[df['number_sta'] == 14066001]
        print("split rows")
        df = df.compute()
        df.to_pickle(filename)
        print("to_pandas")
    df = pd.read_pickle(filename)
    df.dropna(subset=['t'], inplace=True)
    response = [
        {
            "index": df.loc[:, 'date'].values.astype(str).tolist(),
            "values": df.loc[:, 't'].values.tolist()
        },
        {
            "index": df.loc[:, 'date'].values.astype(str).tolist(),
            "values": df.loc[:, 'hu'].values.tolist()
        },
        {
            "index": df.loc[:, 'date'].values.astype(str).tolist(),
            "values": df.loc[:, 'td'].values.tolist()
        }
    ]
    print("response ready")
    response = orjson.dumps(response)
    return response


@app.route('/create-mts-windows', methods=['POST'])
def create_mts_windows():
    t0 = time()
    if not os.path.isfile('processed-data.npy'):
        filename = 'data.pkl'
        df = pd.read_pickle(filename)
        channels = list()
        channels.append(df.loc[:, 't'].fillna(0).values.tolist())
        channels.append(df.loc[:, 'hu'].fillna(0).values.tolist())
        channels.append(df.loc[:, 'td'].fillna(0).values.tolist())
        # Sanity check: no NaNs should survive the fillna(0) above.
        print(np.isnan(df.loc[:, 't'].fillna(0).values.tolist()).any())
        print(np.isnan(df.loc[:, 'hu'].fillna(0).values.tolist()).any())
        print(np.isnan(df.loc[:, 'td'].fillna(0).values.tolist()).any())
        print("Data read: " + str(time() - t0))
        # raw_data = request.json
        window_size = 120  # int(raw_data['parameters']["windowsize"])
        print("Processing: " + str(time() - t0))
        # Slide a window of window_size samples over every channel, stepping by 5.
        data = [[values[i:i + window_size] for values in channels]
                for i in range(0, len(channels[0]) - window_size, 5)]
        print("Raw windows: " + str(time() - t0))
        windows = []
        for i in range(len(data)):
            if i % 5000 == 0:
                print(i)
            # Rescale each channel of the window to [-1, 1] independently.
            windows.append(preprocessing.minmax_scale(data[i], (-1, 1), axis=1))
        print("Preprocessed: " + str(time() - t0))
        np.save('processed-data', windows)
    print("Sending response: " + str(time() - t0))
    return '1'


@app.route('/create-windows', methods=['POST'])
def create_windows():
    t0 = time()
    if reload:
        raw_data = request.json
        window_size = int(raw_data['parameters']["windowsize"])
        chromsize = bbi.chromsizes('test.bigWig')['chr1']
        step_size = int(12000 / 6)
        start_bps = np.arange(0, chromsize - 12000 + step_size, step_size)
        end_bps = np.arange(12000, chromsize + step_size, step_size)
        data = bigwig.chunk(
            'test.bigWig',
            12000,
            int(12000 / window_size),
            int(12000 / 6),
            ['chr1'],
            verbose=True,
        )
        # data = bbi.stackup(
        #     'test.bigWig',
        #     ['chr1'] * start_bps.size,
        #     start_bps,
        #     end_bps,
        #     bins=window_size,
        #     missing=0.0,
        #     oob=0.0,
        # )
        # data = (data - np.min(data)) / np.ptp(data)
        np.save('processed-data', data)
        np.savetxt('processed-data', data, delimiter=' ', fmt='%f')
        np.savetxt('query', data[80503], delimiter=' ', fmt='%f')
    print('Windows created: ' + str(time() - t0))
    return '1'
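# A minimal client sketch (not part of the original server) showing how the
# routes above can be exercised. The base URL and the `requests` dependency
# are assumptions; the payload shapes follow the handlers in this file.
def _example_client():
    import requests
    base = 'http://localhost:5000'
    # Build the sliding windows once; create_mts_windows currently hard-codes
    # window_size = 120, so the body is effectively ignored.
    requests.post(base + '/create-mts-windows',
                  json={'parameters': {'windowsize': 120}})
    # Fetch one stored window by index via /query.
    window = orjson.loads(requests.post(
        base + '/query', data=orjson.dumps({'window': 0})).content)
    print(len(window), 'channels of', len(window[0]), 'samples each')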
@app.route('/initialize', methods=['POST'])
def initialize():
    t0 = time()
    raw_data = orjson.loads(request.data)
    data = np.load('processed-data.npy')
    # Stored layout is (n, channels, length); the reshape below flips the last
    # two axes by flat element order. If a true axis swap is intended,
    # np.transpose(data, (0, 2, 1)) would give (n, length, channels).
    data = np.reshape(data, (len(data), len(data[0][0]), len(data[0])))
    query = raw_data["query"]
    query = np.reshape(query, (len(query[0]), len(query)))
    r, a, sd = preprocess(data)
    candidates, distances, hf = _lsh.lsh(data, query, r, a, sd)
    response = {
        "hash_functions": hf.tolist(),
        "candidates": candidates.tolist(),
        "distances": distances.tolist(),
        "parameters": [float(r), float(a), float(sd)]
    }
    response = orjson.dumps(response)
    print('LSH done: ' + str(time() - t0))
    return response


@app.route('/weights', methods=['POST'])
def weights():
    raw_data = orjson.loads(request.data)
    labels = raw_data["labels"]
    # TODO: calculate channel weights from the labels; stub response for now.
    response = orjson.dumps({})
    return response


@app.route('/update', methods=['POST'])
def update():
    t0 = time()
    raw_data = orjson.loads(request.data)
    data = np.load('processed-data.npy')
    data = np.reshape(data, (len(data), len(data[0][0]), len(data[0])))
    query = raw_data["query"]
    query = np.reshape(query, (len(query[0]), len(query)))
    weights = raw_data["weights"]
    parameters = raw_data["parameters"]
    candidates, distances, hf = _lsh.lsh(
        data, query, parameters[0], parameters[1], parameters[2])
    response = {
        "hash_functions": hf.tolist(),
        "distances": distances.tolist(),
        "candidates": candidates.tolist()
    }
    response = orjson.dumps(response)
    print('LSH done: ' + str(time() - t0))
    return response


@app.route('/query', methods=['POST'])
def query():
    t0 = time()
    raw_data = orjson.loads(request.data)
    window_indices = raw_data['window']
    if isinstance(window_indices, int):
        # A single index: return that window as-is.
        output = np.load('processed-data.npy')[window_indices]
        response = orjson.dumps(output.tolist())
        print("Query done: " + str(time() - t0))
        return response
    else:
        # A mapping of index -> bool: average the selected windows with DBA.
        indices = [int(index) for index, value in window_indices.items()
                   if value is True]
        data = np.load('processed-data.npy')[indices]
        output = performDBA(data)
        response = orjson.dumps(output.tolist())
        print("Query done: " + str(time() - t0))
        return response


@app.route('/window', methods=['POST'])
def window():
    t0 = time()
    raw_data = orjson.loads(request.data)
    indices = raw_data['indices']
    output = np.load('processed-data.npy')[indices]
    response = orjson.dumps(output.tolist())
    print("Windows fetched: " + str(time() - t0))
    return response


@app.route('/table-info', methods=['POST'])
def table_info():
    t0 = time()
    raw_data = orjson.loads(request.data)
    all_windows = raw_data['windows']
    data = np.load('processed-data.npy')
    prototypes = []
    for windows in all_windows:
        actual_windows = data[windows]
        average_values = np.average(actual_windows, 0)
        std_values = np.std(actual_windows, 0)
        max_values = average_values + std_values
        min_values = average_values - std_values
        prototypes.append({
            'average': average_values.tolist(),
            'max': max_values.tolist(),
            'min': min_values.tolist()
        })
    # Pairwise DTW distances between prototype averages, constrained to a
    # Sakoe-Chiba band of 5% of the window length (120 samples).
    distances = [[dtw(np.array(v["average"]), np.array(w["average"]),
                      global_constraint='sakoe_chiba',
                      sakoe_chiba_radius=int(0.05 * 120))
                  for w in prototypes] for v in prototypes]
    response = orjson.dumps({'prototypes': prototypes, 'distances': distances})
    print("Averages calculated: " + str(time() - t0))
    return response
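# A small self-contained sketch of the distance computation used in
# table_info above: pairwise DTW between prototype averages under a
# Sakoe-Chiba band of 5% of the window length. The data here is synthetic
# and for illustration only.
def _example_prototype_distances():
    rng = np.random.default_rng(0)
    protos = [rng.standard_normal((120, 3)) for _ in range(3)]  # (length, channels)
    radius = int(0.05 * 120)
    dists = [[dtw(p, q, global_constraint='sakoe_chiba',
                  sakoe_chiba_radius=radius)
              for q in protos] for p in protos]
    # Expect a symmetric 3x3 matrix with zeros on the diagonal.
    print(np.round(dists, 3))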
def preprocess(data, r=10):
    # return 0.10882589134534404, 3.1202154563478928, 0.9705780396843037
    # data = np.load('processed-data.npy')
    data = np.array(data, dtype='double')
    # data = np.reshape(data, (int(len(data) / 1), 1, len(data[0])))
    # data = np.repeat(data, repeats=1, axis=1)
    t0 = time()
    # Earlier approach, kept for reference: build an r-net over all windows.
    # subset = []
    # for i, window in enumerate(data):
    #     if i % 10000 == 0:
    #         print(str(i) + ':' + str(len(subset)))
    #     state = 1
    #     for s in subset:
    #         if np.linalg.norm(window - data[s]) < r:
    #             state = 0
    #             break
    #     if state == 1:
    #         subset.append(i)
    # Current approach: estimate the DTW/Euclidean statistics on a random
    # sample of 200 windows.
    subset = sample(list(range(len(data))), 200)

    dtw_distances = []
    eq_distances = []
    for i, index_1 in enumerate(subset):
        print(i)
        for index_2 in subset:
            if index_1 == index_2:
                continue
            e = np.linalg.norm(data[index_1] - data[index_2])
            if math.isnan(e):
                continue
            eq_distances.append(e)
            d = dtw(data[index_1], data[index_2],
                    global_constraint='sakoe_chiba',
                    sakoe_chiba_radius=int(0.05 * 120))
            # d = _ucrdtw.ucrdtw(data[index_1], data[index_2], 0.05, False)[1]
            # d = dtw.dtw(data[index_1], data[index_2], dist_method="Euclidean",
            #             window_type="sakoechiba",
            #             window_args={"window_size": 120}).distance
            dtw_distances.append(d)

    ratios = np.array(dtw_distances) / np.array(eq_distances)
    mean_dtw = np.mean(dtw_distances)
    sd_dtw = np.std(dtw_distances)
    mean_eq = np.mean(eq_distances)
    sd_eq = np.std(eq_distances)
    a = np.mean(ratios)
    sd = np.std(ratios)
    # Use the lower tail of the DTW distances (mean - 2.58 sd, roughly the
    # 0.5th percentile under a normality assumption) as the distance
    # threshold, then convert it into the LSH radius r.
    theta = mean_dtw - 2.58 * sd_dtw
    # theta = mean_eq - 2.58 * sd_eq
    r = theta / ((a - sd) * math.sqrt(120))
    # r = theta / math.sqrt(120)
    print('Mean: ' + str(mean_dtw))
    print('Stdev: ' + str(sd_dtw))
    print('Ratio mean: ' + str(a))
    print('Ratio stdev: ' + str(sd))
    print('Theta: ' + str(theta))
    print('r: ' + str(r))
    print('Preprocessing time: ' + str(time() - t0))
    return r, a, sd


def debug_test_lsh():
    data = np.load('processed-data.npy')
    print(data.shape)
    data = np.reshape(data, (len(data), 120, 3))
    # data2 = np.copy(data)
    # for i in range(5):
    #     np.random.shuffle(data2)
    #     data = np.concatenate((data, data2), axis=2)
    #     print(data.shape)
    # data = np.repeat(data, repeats=10, axis=2)
    r, a, sd = preprocess(data, 10)
    create_windows()  # no-op while reload is False
    query_n = 1234
    t0 = time()
    query = data[query_n]
    candidates, distances, hf = _lsh.lsh(data, query, r, a, sd)
    print("Calculated approximate in: " + str(time() - t0))
    print(candidates[0:20])

    t0 = time()
    distances = [dtw(window, query, global_constraint='sakoe_chiba',
                     sakoe_chiba_radius=int(0.05 * 120)) for window in data]
    topk_dtw = sorted(range(len(distances)), key=lambda k: distances[k])
    print("Calculated exact dtw in: " + str(time() - t0))
    print(topk_dtw[0:10])

    # distances_ed = [distance.euclidean(query, window) for window in data]
    # topk_ed = sorted(range(len(distances_ed)), key=lambda k: distances_ed[k])

    # Overlap between the exact DTW top-50 and the LSH candidate top-50.
    accuracy = 0
    for index in topk_dtw[0:50]:
        if index in candidates[0:50]:
            accuracy += 1
    print(accuracy)


# read_mts_data()
# create_mts_windows()
# debug_test_lsh()
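# The file defines no entry point, so the server is presumably started with
# `flask run`. A direct-execution guard would look like this; host and port
# are assumptions, not taken from the original source.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)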