# Flask backend: serves binned bigWig signal, sliding windows, and
# LSH-based similar-pattern search results (with DTW re-ranking helpers).
from flask import Flask, request
import numpy as np
from flask_cors import CORS
from time import time
import orjson
import bigwig
import bbi
import _ucrdtw
import _lsh
import dtw
import math
from random import sample
from DBA import performDBA

reload = False

app = Flask(__name__)
CORS(app)


@app.route('/', methods=['GET'])
def index():
    return "hi"


@app.route('/read-data', methods=['GET'])
def read_data():
    # Return the chr1 signal of test.bigWig binned into `bins` values,
    # together with the genomic start position of each bin.
    t0 = time()
    size = bbi.chromsizes('test.bigWig')['chr1']
    bins = 100000
    data = bigwig.get('test.bigWig', 'chr1', 0, size, bins)
    print(data.shape)
    response = {
        "index": list(range(0, size, int(size / bins))),
        "values": data.tolist()
    }
    response = orjson.dumps(response)
    print('Data read: ' + str(time() - t0))
    return response


@app.route('/create-windows', methods=['POST'])
def create_windows():
    # Slide a 12 kb window (step 2 kb) over chr1, bin each window to
    # `window_size` values, and cache the result on disk. Only rebuilds
    # when the module-level `reload` flag is set.
    t0 = time()
    if reload:
        raw_data = request.json
        window_size = int(raw_data['parameters']["windowsize"])
        chromsize = bbi.chromsizes('test.bigWig')['chr1']
        step_size = int(12000 / 6)
        start_bps = np.arange(0, chromsize - 12000 + step_size, step_size)
        end_bps = np.arange(12000, chromsize + step_size, step_size)
        data = bigwig.chunk(
            'test.bigWig',
            12000,
            int(12000 / window_size),
            int(12000 / 6),
            ['chr1'],
            verbose=True,
        )
        # data = bbi.stackup(
        #     'test.bigWig',
        #     ['chr1'] * start_bps.size,
        #     start_bps,
        #     end_bps,
        #     bins=window_size,
        #     missing=0.0,
        #     oob=0.0,
        # )
        # data = (data - np.min(data)) / np.ptp(data)
        np.save('processed-data', data)
        np.savetxt('processed-data', data, delimiter=' ', fmt='%f')
        np.savetxt('query', data[80503], delimiter=' ', fmt='%f')
    print('Windows created: ' + str(time() - t0))
    return '1'


@app.route('/initialize', methods=['POST'])
def initialize():
    # Estimate the LSH parameters (r, a, sd) from the cached windows and run
    # a first LSH pass for the posted query.
    t0 = time()
    raw_data = orjson.loads(request.data)
    data = np.load('processed-data.npy')
    data = np.reshape(data, (len(data), len(data[0]), 1))
    # data = np.repeat(data, repeats=1, axis=2)
    query = raw_data["query"]
    query = np.reshape(query, (len(query), 1))
    # query = np.repeat(query, repeats=1, axis=1)
    r, a, sd = preprocess(data)
    candidates, distances, hf = _lsh.lsh(data, query, r, a, sd)
    response = {
        "hash_functions": hf.tolist(),
        "candidates": candidates.tolist(),
        "distances": distances.tolist(),
        "parameters": [float(r), float(a), float(sd)]
    }
    response = orjson.dumps(response)
    print('LSH done: ' + str(time() - t0))
    return response


@app.route('/update', methods=['POST'])
def update():
    # Re-run LSH for an updated query, reusing the parameters estimated
    # during /initialize.
    t0 = time()
    raw_data = orjson.loads(request.data)
    data = np.load('processed-data.npy')
    data = np.reshape(data, (len(data), len(data[0]), 1))
    # data = np.repeat(data, repeats=1, axis=2)
    weights = raw_data["weights"]
    query = raw_data["query"]
    query = np.reshape(query, (len(query), 1))
    # query = np.repeat(query, repeats=1, axis=1)
    parameters = raw_data["parameters"]
    candidates, distances, hf = _lsh.lsh(
        data, query, parameters[0], parameters[1], parameters[2])
    response = {
        "hash_functions": hf.tolist(),
        "distances": distances.tolist(),
        "candidates": candidates.tolist()
    }
    response = orjson.dumps(response)
    print('LSH done: ' + str(time() - t0))
    return response


@app.route('/query', methods=['POST'])
def query():
    # Return a single window (when an index is posted) or the DBA average of
    # all windows marked True in the posted index-to-bool mapping.
    t0 = time()
    raw_data = orjson.loads(request.data)
    windowIndices = raw_data['window']
    if isinstance(windowIndices, int):
        output = np.load('processed-data.npy')[windowIndices]
        response = orjson.dumps(output.tolist())
        print("Query done: " + str(time() - t0))
        return response
    else:
        indices = [int(index) for index, value in windowIndices.items() if value is True]
        data = np.load('processed-data.npy')[indices]
        output = performDBA(data)
        response = orjson.dumps(output.tolist())
        print("Query done: " + str(time() - t0))
        return response


@app.route('/window', methods=['POST'])
def window():
    # Return the raw values of the requested windows.
    t0 = time()
    raw_data = orjson.loads(request.data)
    indices = raw_data['indices']
    output = np.load('processed-data.npy')[indices]
    response = orjson.dumps(output.tolist())
    print("Query done: " + str(time() - t0))
    return response


@app.route('/table-info', methods=['POST'])
def table_info():
    # For each posted group of windows, build a prototype (mean +/- stdev)
    # and compute pairwise DTW distances between the prototype averages.
    t0 = time()
    raw_data = orjson.loads(request.data)
    all_windows = raw_data['windows']
    data = np.load('processed-data.npy')
    prototypes = []
    for windows in all_windows:
        actual_windows = data[windows]
        average_values = np.average(actual_windows, 0)
        std_values = np.std(actual_windows, 0)
        max_values = average_values + std_values
        min_values = average_values - std_values
        prototypes.append({
            'average': average_values.tolist(),
            'max': max_values.tolist(),
            'min': min_values.tolist()
        })
    distances = [[_ucrdtw.ucrdtw(np.array(v["average"]), np.array(w["average"]), 0.05 * 120, False)[1]
                  for j, w in enumerate(prototypes)]
                 for i, v in enumerate(prototypes)]
    response = orjson.dumps({'prototypes': prototypes, 'distances': distances})
    print("Averages calculated: " + str(time() - t0))
    return response


def preprocess(data):
    # Estimate the LSH parameters: a and sd are the mean and stdev of the
    # DTW/Euclidean distance ratio over a subset of windows; r is derived
    # from the distance threshold theta.
    # return 0.10882589134534404, 3.1202154563478928, 0.9705780396843037
    # data = np.load('processed-data.npy')
    data = np.array(data, dtype='double')
    # data = np.reshape(data, (int(len(data) / 1), 1, len(data[0])))
    # data = np.repeat(data, repeats=1, axis=1)
    subset = []
    t0 = time()
    r = 3
    # Greedily pick windows that are at least r apart in Euclidean distance.
    for i, window in enumerate(data):
        if i % 10000 == 0:
            print(str(i) + ':' + str(len(subset)))
        state = 1
        for s in subset:
            if np.linalg.norm(window - data[s]) < r:
                state = 0
                break
        if state == 1:
            subset.append(i)
    # subset = sample(list(range(len(data))), 50)

    # Compare DTW and Euclidean distances on all pairs in the subset.
    dtw_distances = []
    eq_distances = []
    for i, index_1 in enumerate(subset):
        print(i)
        for j, index_2 in enumerate(subset):
            if index_1 == index_2:
                continue
            e = np.linalg.norm(data[index_1] - data[index_2])
            eq_distances.append(e)
            d = _ucrdtw.ucrdtw(data[index_1], data[index_2], 0.05, False)[1]
            # d = dtw.dtw(data[index_1], data[index_2], dist_method="Euclidean",
            #             window_type="sakoechiba", window_args={"window_size": 120}).distance
            dtw_distances.append(d)
    ratios = np.array(dtw_distances) / np.array(eq_distances)
    mean_dtw = np.mean(dtw_distances)
    sd_dtw = np.std(dtw_distances)
    mean_eq = np.mean(eq_distances)
    sd_eq = np.std(eq_distances)
    a = np.mean(ratios)
    sd = np.std(ratios)
    theta = mean_dtw + -2.58 * sd_dtw
    # theta = mean_eq + -2.58 * sd_eq
    r = theta / ((a - sd) * math.sqrt(120))
    # r = theta / (math.sqrt(120))
    print('Mean: ' + str(mean_dtw))
    print('Stdev: ' + str(sd_dtw))
    print('Ratio mean: ' + str(a))
    print('Ratio stdev: ' + str(sd))
    print('Theta: ' + str(theta))
    print('r: ' + str(r))
    print('Preprocessing time: ' + str(time() - t0))
    return r, a, sd


def debug_test_lsh():
    # Sanity check: compare the LSH candidates for one window against the
    # exhaustive DTW top-k over all windows.
    data = np.load('processed-data.npy')
    r, a, sd = preprocess(data)
    create_windows()
    query_n = 80503
    query = data[query_n]
    # performDBA(data[[80503, 11514]])
    query = np.reshape(query, (len(data[0]), 1))
    data = np.array(data, dtype='double')
    data = np.reshape(data, (len(data), len(data[0]), 1))
    data = np.repeat(data, repeats=1, axis=2)
    candidates, distances, hf = _lsh.lsh(data, query, r, a, sd)
    print(repr(candidates[0:20]))
    print(distances[0:10])

    data = np.load('processed-data.npy')
    query = data[query_n]
    print(data[0])
    distances = [_ucrdtw.ucrdtw(window, query, 0.05, False)[1] for window in data]
    sorted_distances = sorted(distances)
    print(sorted_distances[0:10])
    topk_dtw = sorted(range(len(distances)), key=lambda k: distances[k])
    print(topk_dtw[0:10])

    # distances_ed = [distance.euclidean(query, window) for window in data]
    # topk_ed = sorted(range(len(distances_ed)), key=lambda k: distances_ed[k])

    # Count how many of the true DTW top-20 appear among the first 20 LSH candidates.
    accuracy = 0
    for index in topk_dtw[0:20]:
        if index in candidates[0:20]:
            accuracy += 1
    print(accuracy)


# debug_test_lsh()
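
# Not part of the original file: a minimal entry-point sketch so the API can be
# started directly with `python <this module>.py`. The original may instead be
# launched via the `flask run` CLI; debug mode and the default port are assumptions.
if __name__ == '__main__':
    app.run(debug=True)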