from flask import Flask, request
import numpy as np
from flask_cors import CORS
from time import time
import pandas as pd
import orjson
import bigwig
import bbi
import _ucrdtw
import _lsh
import math
import dask.dataframe as dd
import os.path
from random import sample
from DBA_multivariate import performDBA
from tslearn.metrics import dtw
from sklearn import preprocessing
from collections import defaultdict
from dtaidistance import dtw_ndim
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw

reload = False

app = Flask(__name__)
CORS(app)


@app.route('/', methods=['GET'])
def index():
    return "hi"


@app.route('/read-data', methods=['GET'])
def read_data():
    # Read the bigWig track of chr1, bin it, and return the binned signal as three (identical) channels.
    t0 = time()
    size = bbi.chromsizes('test.bigWig')['chr1']
    bins = 100000
    data = bigwig.get('test.bigWig', 'chr1', 0, size, bins)
    print(data.shape)
    response = [
        {
            "index": list(range(0, size, int(size / bins))),
            "values": data.tolist()
        },
        {
            "index": list(range(0, size, int(size / bins))),
            "values": data.tolist()
        },
        {
            "index": list(range(0, size, int(size / bins))),
            "values": data.tolist()
        }
    ]
    response = orjson.dumps(response)
    print('Data read: ' + str(time() - t0))
    return response


@app.route('/read-mts-data', methods=['GET'])
def read_mts_data():
    # Load the ground-station CSV once, cache the selected stations as a pickle, and return
    # the temperature (t), humidity (hu) and dew point (td) series of station 14066001.
    filename = 'data.pkl'
    if not os.path.isfile(filename):
        print("start")
        df = dd.read_csv("NW_Ground_Stations_2016.csv",
                         usecols=['number_sta', 'date', 't', 'hu', 'td'])
        print("read file")
        df = df.loc[df['number_sta'].isin(
            [14066001, 14137001, 14216001, 14372001, 22092001, 22113006, 22135001])].fillna(0)
        print("split rows")
        df = df.compute()
        df.to_pickle(filename)
        print("to_pandas")
    df = pd.read_pickle(filename)
    df.dropna(subset=['t'], inplace=True)
    response = [
        {
            "index": df.loc[df['number_sta'] == 14066001].loc[:, 'date'].values.astype(str).tolist(),
            "values": df.loc[df['number_sta'] == 14066001].loc[:, 't'].values.tolist()
        },
        {
            "index": df.loc[df['number_sta'] == 14066001].loc[:, 'date'].values.astype(str).tolist(),
            "values": df.loc[df['number_sta'] == 14066001].loc[:, 'hu'].values.tolist()
        },
        {
            "index": df.loc[df['number_sta'] == 14066001].loc[:, 'date'].values.astype(str).tolist(),
            "values": df.loc[df['number_sta'] == 14066001].loc[:, 'td'].values.tolist()
        }
    ]
    print("response ready")
    response = orjson.dumps(response)
    return response


@app.route('/create-mts-windows', methods=['POST'])
def create_mts_windows():
    # Slice the cached station series into overlapping 120-sample windows,
    # min-max scale every window per channel to [-1, 1], and cache the result on disk.
    t0 = time()
    if not os.path.isfile('processed-data.npy'):
        filename = 'data.pkl'
        df = pd.read_pickle(filename)
        channels = list()
        channels.append(df.loc[df['number_sta'] == 14066001].loc[:, 't'].fillna(0).values.tolist())
        channels.append(df.loc[df['number_sta'] == 14066001].loc[:, 'hu'].fillna(0).values.tolist())
        channels.append(df.loc[df['number_sta'] == 14066001].loc[:, 'td'].fillna(0).values.tolist())
        print("Data read: " + str(time() - t0))
        # raw_data = request.json
        window_size = 120  # int(raw_data['parameters']["windowsize"])
        print("Processing: " + str(time() - t0))
        data = [[values[i:i + window_size] for values in channels]
                for i in range(0, len(channels[0]) - window_size, 1)]
        print("Raw windows: " + str(time() - t0))
        windows = []
        for i in range(len(data)):
            if i % 5000 == 0:
                print(i)
            windows.append(preprocessing.minmax_scale(data[i], (-1, 1), axis=1))
        print("Preprocessed: " + str(time() - t0))
        np.save('processed-data', windows)
        # data = np.load('processed-data.npy')
        # data = np.reshape(data, (len(data), len(data[0][0]), len(data[0])))
        # r, a, sd = preprocess(data, 11.5)
        # np.save('parameters', np.array([r, a, sd]))
    print("Sending response: " + str(time() - t0))
    return '1'
@app.route('/create-windows', methods=['POST'])
def create_windows():
    # Chunk the bigWig track into fixed-size windows and stack two shuffled copies
    # of the signal as extra channels, turning the univariate track into 3-channel test data.
    t0 = time()
    if not os.path.isfile('processed-data.npy'):
        # raw_data = request.json
        # window_size = int(raw_data['parameters']["windowsize"])
        window_size = 120
        data = bigwig.chunk(
            'test.bigWig',
            12000,
            int(12000 / window_size),
            int(12000 / 6),
            ['chr1'],
            verbose=True,
        )
        data = np.reshape(data, (len(data), 1, len(data[0])))
        data2 = np.copy(data)
        np.random.shuffle(data2)
        data3 = np.copy(data)
        np.random.shuffle(data3)
        data = np.concatenate((data, data2), axis=1)
        data = np.concatenate((data, data3), axis=1)
        # data = np.repeat(data, repeats=3, axis=1)
        np.save('processed-data', data)
    print('Windows created: ' + str(time() - t0))
    return '1'


@app.route('/create-test-windows', methods=['POST'])
def create_test_windows():
    # Build 5-channel windows of length 120 (stride 15) from a CSV test file and repeat them to enlarge the dataset.
    t0 = time()
    if not os.path.isfile('processed-data.npy'):
        datafile = '21.csv'
        data = pd.read_csv(datafile, header=None)
        # and convert it to numpy array:
        npdata = np.array(data)
        print('data loaded')
        window_data = [npdata[i:i + 120, 0:5] for i in range(0, npdata.shape[0] - 120, int(120 / 8))]
        del npdata
        print('data created')
        np_window_data = np.repeat(window_data, repeats=3, axis=0)
        print(np_window_data.shape)
        del window_data
        data = np.reshape(np_window_data, (len(np_window_data), 5, len(np_window_data[0])))
        print(data.shape)
        np.save('processed-data', data)
    print('Windows created: ' + str(time() - t0))
    return '1'


@app.route('/initialize', methods=['POST'])
def initialize():
    # Estimate LSH parameters on the stored windows, run LSH for the given query,
    # and return candidates, per-table histograms and aggregated rankings.
    t0 = time()
    raw_data = orjson.loads(request.data)
    data = np.load('processed-data.npy')
    data = np.swapaxes(data, 1, 2)
    # data = np.reshape(data, (len(data), len(data[0][0]), len(data[0])))
    query = raw_data["query"]
    query = np.swapaxes(query, 0, 1)
    # query = np.reshape(query, (len(query[0]), len(query)))

    parameters = preprocess(data)
    # parameters = np.load('parameters.npy')
    r = parameters[0]
    a = parameters[1]
    sd = parameters[2]
    candidates, distances, hf = _lsh.lsh(data, query, r, a, sd)
    print(distances)

    # Aggregate every candidate's distances over all tables and sort by the summed distance.
    dict = defaultdict(int)
    for l in range(len(candidates)):
        for k in range(len(candidates[0])):
            for i in range(len(candidates[0][0])):
                dict[candidates[l][k][i]] += distances[l][k][i]
    sorted_dict = {k: v for k, v in sorted(dict.items(), key=lambda item: item[1])}
    average_candidates = list(sorted_dict.keys())
    average_distances = list(sorted_dict.values())

    # Bucket every table's candidates into 20 bins based on distance relative to that table's median.
    tables = []
    samples_set = set()
    candidates = candidates.tolist()
    for l in range(len(candidates)):
        for k in range(len(candidates[0])):
            samples_set.update(candidates[l][k][0:5])
            dict = defaultdict(list)
            length = len(distances[l][k])
            median = distances[l][k][math.ceil(length / 2)]
            stepsize = median / 10
            indices = list(map(lambda x: 19 if x > median * 2 else math.floor(x / stepsize), distances[l][k]))
            for i in range(len(candidates[0][0])):
                dict[str(indices[i])].append(candidates[l][k][i])
            tables.append(dict)

    # The sample set holds each table's top-5 candidates, ordered by their aggregated distance.
    samples = np.array(list(filter(lambda x: x in samples_set, average_candidates))).tolist()
    response = {
        "hash_functions": hf.reshape((len(candidates) * len(candidates[0]), len(query[0]))).tolist(),
        "candidates": candidates,
        "tables": tables,
        "distances": distances.tolist(),
        "samples": list(samples),
        "average_candidates": np.array(average_candidates).tolist(),
        "average_distances": np.array(average_distances).tolist(),
        "parameters": [float(r), float(a), float(sd)]
    }
    response = orjson.dumps(response)
    print('LSH done: ' + str(time() - t0))
    return response


@app.route('/weights', methods=['POST'])
def weights():
    # Blend the previous channel weights with feedback from labeled windows
    # and/or selected hash functions (exponential smoothing with factor alpha).
    alpha = 0.2
    raw_data = orjson.loads(request.data)
    labels = raw_data["labels"]
    hash_functions = raw_data["hash_functions"]
raw_data["hash_functions"] query = raw_data["query"] old_weights = raw_data["weights"] data = np.load('processed-data.npy') all_good_windows = data[[[int(index) for index, value in labels.items() if value is True]]] good_distances = np.zeros(len(query)) for window in all_good_windows: for i in range(len(all_good_windows[0])): good_distances[i] += _ucrdtw.ucrdtw(query[i], window[i], 0.05, False)[1] if len(all_good_windows) != 0: good_distances = np.square(good_distances) good_distances /= np.sum(good_distances) good_distances = np.ones(len(query)) - good_distances good_distances /= np.sum(good_distances) good_distances *= len(all_good_windows[0]) good_distances = np.sqrt(good_distances) if len(hash_functions) != 0: summed_hash_functions = np.sum(hash_functions, axis=0) summed_hash_functions = np.square(summed_hash_functions) normalized_hash_functions = summed_hash_functions / np.sum(summed_hash_functions) normalized_hash_functions *= len(hash_functions[0]) if len(hash_functions) + len(all_good_windows) == 0: print("no update") new_weights = old_weights elif len(hash_functions) == 0: print("only windows") new_weights = alpha * np.array(old_weights) + (1 - alpha) * good_distances elif len(all_good_windows) == 0: print("only tables") new_weights = alpha * np.array(old_weights) + (1 - alpha) * normalized_hash_functions else: print("tables & windows") new_weights = alpha * np.array(old_weights) + 0.5 * (1-alpha) * good_distances + 0.5 * (1-alpha) * normalized_hash_functions print(new_weights) response = orjson.dumps(new_weights.tolist()) return response @app.route('/update', methods=['POST']) def update(): t0 = time() raw_data = orjson.loads(request.data) data = np.load('processed-data.npy') data = np.swapaxes(data, 1, 2) # data = np.reshape(data, (len(data), len(data[0][0]), len(data[0]))) query = raw_data["query"] query = np.swapaxes(query, 0, 1) # query = np.reshape(query, (len(query[0]), len(query))) weights = raw_data["weights"] parameters = raw_data["parameters"] candidates, distances, hf = _lsh.lsh(data, query, parameters[0], parameters[1], parameters[2], weights) dict = defaultdict(int) for l in range(len(candidates)): for k in range(len(candidates[0])): for i in range(len(candidates[0][0])): dict[candidates[l][k][i]] += distances[l][k][i] sorted_dict = {k: v for k, v in sorted(dict.items(), key=lambda item: item[1])} average_candidates = list(sorted_dict.keys()) average_distances = list(sorted_dict.values()) tables = [] samples_set = set() candidates = candidates.tolist() for l in range(len(candidates)): for k in range(len(candidates[0])): samples_set.update(candidates[l][k][0:5]) dict = defaultdict(list) length = len(distances[l][k]) median = distances[l][k][math.ceil(length/2)] stepsize = median / 10 indices = list(map(lambda x: 19 if x > median * 2 else math.floor(x / stepsize), distances[l][k])) for i in range(len(candidates[0][0])): dict[str(indices[i])].append(candidates[l][k][i]) tables.append(dict) samples = np.array(list(filter(lambda x: x in samples_set, average_candidates))).tolist() response = { "hash_functions": hf.reshape((len(candidates) * len(candidates[0]), len(query[0]))).tolist(), "candidates": candidates, "tables": tables, "samples": list(samples), "average_candidates": np.array(average_candidates).tolist(), "average_distances": np.array(average_distances).tolist(), "distances": distances.tolist(), } response = orjson.dumps(response) print('LSH done: ' + str(time()-t0)) return response @app.route('/query', methods=['POST']) def query(): t0 = time() raw_data = 
    windowIndices = raw_data['window']
    if isinstance(windowIndices, int):
        # A single index: return that window as-is.
        output = np.load('processed-data.npy')[windowIndices]
        response = orjson.dumps(output.tolist())
        print("Query done: " + str(time() - t0))
        return response
    else:
        # A dict of labels: average all windows labeled as good with DBA.
        indices = [int(index) for index, value in windowIndices.items() if value is True]
        data = np.load('processed-data.npy')[indices]
        output = performDBA(data)
        response = orjson.dumps(output.tolist())
        print("Query done: " + str(time() - t0))
        return response


@app.route('/window', methods=['POST'])
def window():
    # Return the raw values of the requested windows.
    t0 = time()
    raw_data = orjson.loads(request.data)
    indices = raw_data['indices']
    output = np.load('processed-data.npy')[indices]
    response = orjson.dumps(output.tolist())
    print("Query done: " + str(time() - t0))
    return response


@app.route('/table-info', methods=['POST'])
def table_info():
    # Compute a mean +/- stddev prototype for every list of window indices.
    t0 = time()
    raw_data = orjson.loads(request.data)
    all_windows = raw_data['windows']
    data = np.load('processed-data.npy')
    prototypes = []
    for windows in all_windows:
        actual_windows = data[windows]
        average_values = np.average(actual_windows, 0)
        std_values = np.std(actual_windows, 0)
        max_values = average_values + std_values
        min_values = average_values - std_values
        prototypes.append({
            'average': average_values.tolist(),
            'max': max_values.tolist(),
            'min': min_values.tolist()
        })
    # distances = [[dtw(np.array(v["average"]), np.array(w["average"]), global_constraint='sakoe_chiba',
    #                   sakoe_chiba_radius=int(0.05 * 120)) for j, w in enumerate(prototypes)]
    #              for i, v in enumerate(prototypes)]
    response = orjson.dumps({'prototypes': prototypes, 'distances': []})
    print("Averages calculated: " + str(time() - t0))
    return response


def preprocess(data, r=10.0):
    # Estimate the LSH parameters: greedily pick a subset of windows that are at least r apart
    # (retuning r until the subset size is workable), then compare DTW and Euclidean distances
    # on that subset to derive the radius r and the DTW/Euclidean ratio statistics a and sd.
    # return 0.10882589134534404, 3.1202154563478928, 0.9705780396843037
    # data = np.load('processed-data.npy')
    # data = np.reshape(data, (59999, 20, 120))
    # data = np.repeat(data, repeats=1, axis=1)
    subset = []
    t0 = time()

    i = 0
    while i < len(data):
        if i % 999 == 0:
            print(r)
            print(str(i) + ':' + str(len(subset)))
        state = 1
        for s in subset:
            if np.linalg.norm(data[i] - data[s]) < r:
                state = 0
                break
        if state == 1:
            subset.append(i)

        i = i + 1
        # Restart with a different r if the subset turns out too small or too large.
        if i == 10000 and len(subset) < 10:
            r = r / 2
            subset = []
            i = 0
        if len(subset) > 200:
            r = r + r / 2
            subset = []
            i = 0
    # subset = sample(list(range(len(data))), 200)
    print("r = " + str(r))

    dtw_distances = []
    eq_distances = []
    for i, index_1 in enumerate(subset):
        print(i)
        for j, index_2 in enumerate(subset):
            if index_1 == index_2:
                continue
            e = np.linalg.norm(data[index_1] - data[index_2])
            if math.isnan(e) or e == 0:
                eq_distances.append(0.0001)
                dtw_distances.append(0.0001)
                continue
            eq_distances.append(e)
            d = 0
            # d, _ = fastdtw(data[index_1], data[index_2], dist=euclidean)
            d = dtw(data[index_1], data[index_2], global_constraint='sakoe_chiba', sakoe_chiba_radius=int(0.05 * 120))
            # d = _ucrdtw.ucrdtw(data[index_1], data[index_2], 0.05, False)[1]
            # d = dtw.dtw(data[index_1], data[index_2], dist_method="Euclidean", window_type="sakoechiba",
            #             window_args={"window_size": 120}).distance
            dtw_distances.append(d)

    ratios = np.array(dtw_distances) / np.array(eq_distances)
    mean_dtw = np.mean(dtw_distances)
    sd_dtw = np.std(dtw_distances)
    mean_eq = np.mean(eq_distances)
    sd_eq = np.std(eq_distances)
    a = np.mean(ratios)
    sd = np.std(ratios)
    theta = mean_dtw + -2.58 * sd_dtw
    # theta = mean_eq + -2.58 * sd_eq
    r = theta / ((a - sd) * math.sqrt(120))
    if r < 0:
        r = mean_dtw / 100
    # r = theta / (math.sqrt(120))
    print('Mean: ' + str(mean_dtw))
    print('Stdev: ' + str(sd_dtw))
    print('Ratio mean: ' + str(a))
    print('Ratio stdev: ' + str(sd))
    print('Theta: ' + str(theta))
    print('r: ' + str(r))
    print('Preprocessing time: ' + str(time() - t0))
    return r, a, sd


def debug_test_lsh():
    # Compare the LSH candidate ranking for one query window against exact DTW and Euclidean rankings.
    data = np.load('processed-data.npy')
    # data = np.repeat(data, repeats=7, axis=1)
    print(data.shape)
    data = np.reshape(data, (len(data), len(data[0][0]), len(data[0])))
    r, a, sd = preprocess(data, 11.25)
    create_windows()
    query_n = 1234
    t0 = time()
    query = data[query_n]
    data = data.astype('double')
    dict = defaultdict(int)
    candidates, distances, hf = _lsh.lsh(data, query, r, a, sd)
    print("Calculated approximate in: " + str(time() - t0))
    for l in range(len(candidates)):
        for k in range(len(candidates[0])):
            for i in range(len(candidates[0][0])):
                dict[candidates[l][k][i]] += distances[l][k][i]
    sorted_dict = {k: v for k, v in sorted(dict.items(), key=lambda item: item[1])}
    candidates = list(sorted_dict.keys())
    print(candidates[0:20])

    t0 = time()
    # distances = [dtw_ndim.distance_fast(window, query) for window in data]
    distances = [dtw(window, query, global_constraint='sakoe_chiba', sakoe_chiba_radius=int(0.05 * 120))
                 for window in data]
    topk_dtw = sorted(range(len(distances)), key=lambda k: distances[k])
    print("Calculated exact dtw in: " + str(time() - t0))
    print(topk_dtw[0:20])

    t0 = time()
    l2distances = [np.linalg.norm(window - query) for window in data]
    print("Calculated exact l2 in: " + str(time() - t0))

    # distances_ed = [distance.euclidean(query, window) for window in data]
    # topk_ed = sorted(range(len(distances_ed)), key=lambda k: distances_ed[k])

    # Count how many of the exact DTW top-20 windows also appear among the LSH candidates.
    accuracy = 0
    for index in topk_dtw[0:20]:
        if index in candidates:
            accuracy += 1
    print(accuracy)


# read_mts_data()
# create_mts_windows()
# debug_test_lsh()
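

# A minimal entry point for local development is sketched below; the host, port and debug flag
# are assumptions, since the original code does not show how the server is actually launched.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000, debug=True)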