from flask import Flask, request
import numpy as np
from flask_cors import CORS
from time import time
import orjson
import bigwig
import bbi
import _ucrdtw
import _lsh
from scipy.spatial import distance  # used by the commented-out Euclidean baseline in lsh_method()
import dtw
import math
from random import sample  # used by the commented-out random-subset sampling in preprocess()
from DBA import performDBA

# When True, /create-windows recomputes the window matrix from the bigWig
# file; when False, it reuses the cached processed-data.npy.
reload = False

app = Flask(__name__)
CORS(app)


@app.route('/', methods=['GET'])
def index():
    return "hi"


@app.route('/read-data', methods=['GET'])
def read_data():
    t0 = time()
    size = bbi.chromsizes('test.bigWig')['chr1']
    bins = 100000
    data = bigwig.get('test.bigWig', 'chr1', 0, size, bins)
    print(data.shape)
    response = {
        "index": list(range(0, size, int(size / bins))),
        "values": data.tolist()
    }
    response = orjson.dumps(response)
    print('Data read: ' + str(time() - t0))
    return response


@app.route('/create-windows', methods=['POST'])
def create_windows():
    t0 = time()
    if reload:
        # raw_data = request.json
        # window_size = int(raw_data['parameters']["windowsize"])
        window_size = 120
        chromsize = bbi.chromsizes('test.bigWig')['chr1']
        step_size = int(12000 / 6)
        # start_bps/end_bps feed the commented-out bbi.stackup path below.
        start_bps = np.arange(0, chromsize - 12000 + step_size, step_size)
        end_bps = np.arange(12000, chromsize + step_size, step_size)
        data = bigwig.chunk(
            'test.bigWig',
            12000,
            int(12000 / window_size),
            int(12000 / 6),
            ['chr1'],
            verbose=True,
        )
        # data = bbi.stackup(
        #     'test.bigWig',
        #     ['chr1'] * start_bps.size,
        #     start_bps,
        #     end_bps,
        #     bins=window_size,
        #     missing=0.0,
        #     oob=0.0,
        # )
        # data = (data - np.min(data)) / np.ptp(data)
        print(data.shape)
        # Binary cache ('processed-data.npy') plus plain-text dumps of the
        # window matrix and one example query window.
        np.save('processed-data', data)
        np.savetxt('processed-data', data, delimiter=' ', fmt='%f')
        np.savetxt('query', data[80503], delimiter=' ', fmt='%f')
    print('Windows created: ' + str(time() - t0))
    return '1'


@app.route('/initialize', methods=['POST'])
def initialize():
    t0 = time()
    data = np.load('processed-data.npy')
    data = np.array(data, dtype='double')
    data = np.reshape(data, (len(data), len(data[0]), 1))
    data = np.repeat(data, repeats=1, axis=2)
    raw_data = orjson.loads(request.data)
    query = raw_data["query"]
    query = np.reshape(query, (len(query), 1))
    query = np.repeat(query, repeats=1, axis=1)
    r, a, sd = preprocess()
    candidates, distances, hf = _lsh.lsh(data, query, r, a, sd)
    response = {
        "hash_functions": hf.tolist(),
        "candidates": candidates.tolist(),
        "distances": distances.tolist(),
        "parameters": [float(r), float(a), float(sd)]
    }
    response = orjson.dumps(response)
    print('done: ' + str(time() - t0))
    return response
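
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original server: the request/response
# shape that /initialize expects. The host/port, the `requests` dependency,
# and the 120-sample zero query are assumptions; the query length must match
# the window size used by /create-windows.
# ---------------------------------------------------------------------------
def example_initialize_request():
    import requests  # assumed to be available in the client environment
    payload = {"query": [0.0] * 120}  # one window-length series of doubles
    resp = requests.post("http://localhost:5000/initialize", json=payload)
    result = orjson.loads(resp.content)
    # result["candidates"]: window indices returned by the LSH scan;
    # result["parameters"]: [r, a, sd], echoed back to /update together
    # with result["hash_functions"].
    return result
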
"hash_functions": hf.tolist(), "distances": distances.tolist(), "candidates": candidates.tolist() } response = orjson.dumps(response) print('done: ' + str(time()-t0)) return response @app.route('/query', methods=['POST']) def query(): t0 = time() raw_data = orjson.loads(request.data) window = raw_data['window'] if isinstance(window, int): output = np.load('processed-data.npy')[window] response = { "average": output.tolist(), "distances": [] } response = orjson.dumps(response) print("Query done: " + str(time() - t0)) return response else: indices = [int(index) for index, value in window.items() if value is True] data = np.load('processed-data.npy')[indices] # average = np.sum(data, axis=0)/len(window) average = performDBA(data) # mins = np.absolute(data.min(axis=0) - average) # maxs = np.absolute(data.max(axis=0) - average) distances = np.absolute(np.sum(data, axis=0) - average * len(indices)) #np.maximum(mins, maxs)#np.array([0]*120) response = { "average": average.tolist(), "distances": distances.tolist() } response = orjson.dumps(response) print("Query done: " + str(time()-t0)) return response @app.route('/window', methods=['POST']) def window(): t0 = time() raw_data = orjson.loads(request.data) indices = raw_data['indices'] output = np.load('processed-data.npy')[indices] response = orjson.dumps(output.tolist()) print("Query done: " + str(time() - t0)) return response @app.route('/average', methods=['POST']) def average(): t0 = time() raw_data = orjson.loads(request.data) all_windows = raw_data['windows'] data = np.load('processed-data.npy') averages = [] print("Initialized: " + str(time() - t0)) print(len(all_windows)) for windows in all_windows: t1 = time() actual_windows = data[windows] print(len(actual_windows)) average_values = np.average(actual_windows, 0) # average_values = (np.sum(actual_windows, 0) / len(actual_windows)) std_values = np.std(actual_windows, 0) max_values = average_values + std_values min_values = average_values - std_values # max_values = np.maximum.reduce(actual_windows).tolist() # min_values = np.minimum.reduce(actual_windows).tolist() averages.append({ 'average': average_values.tolist(), 'max': max_values.tolist(), 'min': min_values.tolist() }) distances = [[_ucrdtw.ucrdtw(np.array(v["average"]), np.array(w["average"]), 0.05 * 120, False)[1] for j, w in enumerate(averages)] for i, v in enumerate(averages)] response = orjson.dumps({'averages': averages, 'distances': distances}) print("Averages calculated: " + str(time() - t0)) return response def preprocess(): return 0.10882589134534404, 3.1202154563478928, 0.9705780396843037 data = np.load('processed-data.npy') data = np.array(data, dtype='double') data = np.reshape(data, (int(len(data) / 1), 1, len(data[0]))) data = np.repeat(data, repeats=1, axis=1) subset = [] # query = data[80503] t0 = time() # for i, window in enumerate(data): # print(i) # a = dtw.dtw(window, query, dist_method="Euclidean").distance # print(time() - t0) # print("done") r = 3 for i, window in enumerate(data): if i % 10000 == 0: print(str(i) + ':' + str(len(subset))) state = 1 for s in subset: if np.linalg.norm(window - data[s]) < r: state = 0 break if state == 1: subset.append(i) # # subset = sample(list(range(len(data))), 50) # print(subset) dtw_distances = [] eq_distances = [] for i, index_1 in enumerate(subset): print(i) for j, index_2 in enumerate(subset): if index_1 == index_2: continue e = np.linalg.norm(data[index_1] - data[index_2]) eq_distances.append(e) d = dtw.dtw(data[index_1], data[index_2], dist_method="Euclidean", 
window_type="sakoechiba", window_args={"window_size": 120}).distance dtw_distances.append(d) ratios = np.array(dtw_distances)/np.array(eq_distances) mean_dtw = np.mean(dtw_distances) sd_dtw = np.std(dtw_distances) mean_eq = np.mean(eq_distances) sd_eq = np.std(eq_distances) a = np.mean(ratios) sd = np.std(ratios) theta = mean_dtw + -2.58 * sd_dtw # theta = mean_eq + -2.58 * sd_eq r = theta / ((a-sd)*math.sqrt(120)) # r = theta / (math.sqrt(120)) print(mean_dtw) print(sd_dtw) print(a) print(sd) print(theta) print(r) print(time() - t0) return r, a, sd def dtw_query(): data = np.load('processed-data.npy') data= np.array(data, dtype='double') query = data[80503] t0 = time() distances = _ucrdtw.ucrdtw(data, query, 0.05) print(distances) # distances = [_ucrdtw.ucrdtw(window, query, 0.05) for window in data] print(time() - t0) def lsh_method(r, a, sd): create_windows() query_n = 80503 data = np.load('processed-data.npy') query = performDBA(data[[80503, 11514]]) query = np.reshape(query, (len(data[0]), 1)) data= np.array(data, dtype='double') data = np.reshape(data, (len(data), len(data[0]), 1)) data = np.repeat(data, repeats=1, axis=2) # query = data[query_n] candidates, distances, hf = _lsh.lsh(data, query, r, a, sd) print(repr(candidates[0:20])) print(distances[0:10]) print(np.where(candidates == 80503)) print(np.where(candidates == 11514)) data = np.load('processed-data.npy') query = data[query_n] distances = [_ucrdtw.ucrdtw(window, query, 0.05 * 120, False)[1] for window in data] topk_dtw = sorted(range(len(distances)), key=lambda k: distances[k]) print(topk_dtw[0:10]) # # for candidate in candidates[0:20]: # print(_ucrdtw.ucrdtw(data[candidate], query, 0.05, False)[1]) # # # distances_ed = [distance.euclidean(query, window) for window in data] # # topk_ed = sorted(range(len(distances_ed)), key=lambda k: distances_ed[k]) # # accuracy = 0 # for index in topk_dtw[0:20]: # if index in candidates[0:20]: # accuracy += 1 # print(accuracy) # # accuracy = 0 # for index in topk_dtw[0:20]: # if index in candidates[0:50]: # accuracy += 1 # print(accuracy) # # # accuracy = 0 # # for index in topk_ed[0:20]: # # if index in candidates[0:20]: # # accuracy += 1 # # print(accuracy) # # # # accuracy = 0 # # for index in topk_ed[0:50]: # # if index in candidates[0:50]: # # accuracy += 1 # # print(accuracy) # # accuracy = 0 # for index in topk_dtw[0:50]: # if index in candidates[0:1000]: # accuracy += 1 # print(accuracy) # # accuracy = 0 # for index in topk_dtw[0:50]: # if index in candidates[0:5000]: # accuracy += 1 # print(accuracy) # # accuracy = 0 # for index in topk_dtw[0:50]: # if index in candidates[0:10000]: # accuracy += 1 # print(accuracy) # # accuracy = 0 # for index in topk_dtw[0:50]: # if index in candidates[0:50000]: # accuracy += 1 # print(accuracy) # # accuracy = 0 # for index in topk_dtw[0:50]: # if index in candidates: # accuracy += 1 # print(accuracy) # r, a, sd = preprocess() # lsh_method(r, a, sd)