# Flask backend: serves a bigWig signal and LSH-based similarity search
# over fixed-size windows of it.
from flask import Flask, jsonify, request
import pandas as pd
import numpy as np
from flask_cors import CORS
from collections import defaultdict, Counter
from time import time
import os.path
import json
from sklearn import preprocessing
import orjson
import dask.dataframe as dd
import bigwig
import bbi
from bitarray import bitarray
import _ucrdtw
import _lsh
from scipy.spatial import distance
from scipy.sparse import dia_matrix
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
import dtw
import math
from random import sample

# Set to True to recompute the window matrix in create_windows().
reload = False

app = Flask(__name__)
CORS(app)


def calculate_signatures_random_weights(data, window_size=None, hash_size=None,
                                        hash_function=None):
    # Uniform random hyperplanes; signatures are returned as bit strings.
    # Note: a single 1-D window returns only the bit string, without the
    # hash function.
    if hash_function is None:
        hash_function = np.random.uniform(-1, 1, size=(window_size, hash_size))
    signatures_bool = np.dot(data, hash_function) > 0
    if signatures_bool.ndim == 1:
        return ''.join(['1' if x else '0' for x in signatures_bool])
    return [''.join(['1' if x else '0' for x in lst])
            for lst in signatures_bool], hash_function


def calculate_signatures_cumsum_weights(data, window_size=None, hash_size=None,
                                        hash_function=None):
    # Random-walk hyperplanes (cumulative sums of uniform noise).
    if hash_function is None:
        hash_function = np.array([np.cumsum(np.random.uniform(-1, 1, window_size))
                                  for _ in range(hash_size)]).transpose()
    signatures_bool = np.dot(data, hash_function) > 0
    # Pack the sign bits into one integer per window. Choose the packing axis
    # from the array's dimensionality: hash_size is None whenever a
    # precomputed hash_function is passed in, so it cannot be relied on here.
    if signatures_bool.ndim == 1:
        signatures_int = np.packbits(signatures_bool)
    else:
        signatures_int = np.packbits(signatures_bool, axis=1).flatten()
    return signatures_int.tolist(), hash_function


def calculate_signatures_normal_weights(data, window_size=None, hash_size=None,
                                        hash_function=None):
    # Gaussian hyperplanes (standard random-projection LSH).
    if hash_function is None:
        hash_function = np.array([np.random.normal(0, 1, window_size)
                                  for _ in range(hash_size)]).transpose()
    signatures_bool = np.dot(data, hash_function) > 0
    # Pack per window (see calculate_signatures_cumsum_weights).
    if signatures_bool.ndim == 1:
        signatures_int = np.packbits(signatures_bool)
    else:
        signatures_int = np.packbits(signatures_bool, axis=1).flatten()
    return signatures_int.tolist(), hash_function


def calculate_signatures_normal_split_weights(data, window_size=None, hash_size=None,
                                              hash_function=None):
    # Block variant: each bit only sees its own contiguous slice of the window.
    if hash_function is None:
        hash_function = []
        interval = int(window_size / hash_size)
        empty = np.zeros(window_size)
        for i in range(hash_size):
            copy = np.copy(empty)
            copy[i * interval:(i + 1) * interval] = np.random.normal(0, 1, interval)
            hash_function.append(copy)
        hash_function = np.array(hash_function).transpose()
    signatures_bool = np.dot(data, hash_function) > 0
    # Pack per window (see calculate_signatures_cumsum_weights).
    if signatures_bool.ndim == 1:
        signatures_int = np.packbits(signatures_bool)
    else:
        signatures_int = np.packbits(signatures_bool, axis=1).flatten()
    return signatures_int.tolist(), hash_function
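# The signature generators above differ mainly in how the projection matrix
# is drawn; each maps a window of length window_size onto hash_size sign
# bits. A minimal usage sketch (toy sizes; `toy_windows` is made up for
# illustration):
#
#     toy_windows = np.random.rand(4, 120)      # 4 windows of 120 bins each
#     sigs, hf = calculate_signatures_normal_weights(
#         toy_windows, window_size=120, hash_size=8)
#     # sigs holds one packed integer (0-255) per window; windows whose
#     # projections share all 8 signs land in the same bucket.
#     sigs_again, _ = calculate_signatures_normal_weights(toy_windows,
#                                                         hash_function=hf)
#     assert sigs == sigs_again                 # reusing hf is deterministic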
def calculate_signatures_new(data, window_size=None, hash_size=None,
                             hash_function=None):
    # Experimental batched variant that hashes a raw 1-D series by sliding
    # the hash function along it; unused (lsh_function below selects the
    # Gaussian version).
    if hash_function is None:
        hash_function = np.array([np.cumsum(np.random.uniform(-1, 1, window_size))
                                  for _ in range(hash_size)]).transpose()
    if len(data) == len(np.array(hash_function)[:, 0]):
        signatures_bool = np.dot(data, hash_function) > 0
        output = signatures_bool.astype(int)[0]
        print(output)
        return output
    print('starting hashing')
    t0 = time()
    all_signatures = []
    batch_size = 20
    data = data.transpose()
    # Each row of temp holds the hash function shifted by one position, so a
    # single matrix product hashes batch_size consecutive windows at once.
    temp = np.zeros((batch_size, window_size + batch_size - 1))
    for h in range(hash_size):
        for i in range(batch_size):
            temp[i, i:i + window_size] = hash_function[:, h]
        print('first: ' + str(time() - t0))
        signatures_bool = [np.dot(temp, data[i:i + window_size + batch_size - 1]) > 0
                           for i in range(0, len(data) - window_size, batch_size)]
        # signatures_bool = []
        # for i in range(0, len(data) - window_size, batch_size):
        #     if i % 1000000 == 0:
        #         print(i)
        #     signatures_bool.append(
        #         np.dot(temp, data[i:i + window_size + batch_size - 1]) > 0)
        print('second: ' + str(time() - t0))
        all_signatures.append(np.array(signatures_bool).flatten().astype(int))
    print('done')
    signatures_int = np.packbits(np.stack(np.array(all_signatures), axis=1),
                                 axis=0).flatten()
    return signatures_int.tolist(), hash_function


# The hash family used by all routes below.
lsh_function = calculate_signatures_normal_weights


@app.route('/', methods=['GET'])
def index():
    return "hi"


@app.route('/read-data', methods=['GET'])
def read_data():
    t0 = time()
    size = bbi.chromsizes('test.bigWig')['chr1']
    bins = 100000
    data = bigwig.get('test.bigWig', 'chr1', 0, size, bins)
    print(data.shape)
    response = {
        "index": list(range(0, size, int(size / bins))),
        "values": data.tolist()
    }
    response = orjson.dumps(response)
    print('Data read: ' + str(time() - t0))
    return response


@app.route('/create-windows', methods=['POST'])
def create_windows():
    # Chunk chr1 into overlapping windows and cache them on disk; the cached
    # matrix is reused unless reload is set.
    t0 = time()
    if reload:
        # raw_data = request.json
        # window_size = int(raw_data['parameters']["windowsize"])
        window_size = 120
        chromsize = bbi.chromsizes('test.bigWig')['chr1']
        step_size = int(12000 / 6)
        start_bps = np.arange(0, chromsize - 12000 + step_size, step_size)
        end_bps = np.arange(12000, chromsize + step_size, step_size)
        data = bigwig.chunk(
            'test.bigWig',
            12000,
            int(12000 / window_size),
            int(12000 / 6),
            ['chr1'],
            verbose=True,
        )
        # data = bbi.stackup(
        #     'test.bigWig',
        #     ['chr1'] * start_bps.size,
        #     start_bps,
        #     end_bps,
        #     bins=window_size,
        #     missing=0.0,
        #     oob=0.0,
        # )
        # data = (data - np.min(data)) / np.ptp(data)
        print(data.shape)
        np.save('processed-data', data)
        np.savetxt('processed-data', data, delimiter=' ', fmt='%f')
    print('Windows created: ' + str(time() - t0))
    return '1'


@app.route('/create-tables', methods=['POST'])
def create_tables():
    data = np.load('processed-data.npy')
    raw_data = orjson.loads(request.data)
    window_size = int(raw_data['parameters']["windowsize"])
    hash_size = int(raw_data['parameters']["hashsize"])
    table_size = int(raw_data['parameters']["tablesize"])
    t0 = time()
    # Offline experiment left in from development: estimates the LSH
    # parameters and exercises the _lsh extension before building the tables.
    r, a, sd = preprocess()
    lsh_method(r, a, sd)
    hash_functions, tables = lsh(data, window_size, hash_size, table_size)
    response = {}
    for table_index in range(table_size):
        response[str(table_index)] = {
            "hash": hash_functions[table_index],
            "entries": tables[table_index]
        }
    response = jsonify(response)
    print('done: ' + str(time() - t0))
    return response


def lsh(data, window_size, hash_size, table_size):
    # Build table_size independent hash tables, each mapping a packed
    # signature to the indices of the windows that produced it.
    tables_hash_function = []
    tables = []
    print(data.shape)
    for index in range(table_size):
        signatures, hash_function = lsh_function(data, window_size=window_size,
                                                 hash_size=hash_size)
        print(index)
        table = defaultdict(list)
        for v, k in enumerate(signatures):
            table[k].append(v)
        tables.append(table)
        tables_hash_function.append(hash_function.tolist())
    hash_functions = tables_hash_function
    return hash_functions, tables


@app.route('/similarity', methods=['POST'])
def similarity():
    t0 = time()
    raw_data = orjson.loads(request.data)
    window = raw_data['query']
    tables = raw_data["tables"]
    neighbours = []
    output = defaultdict(list)
    for i, t in enumerate(tables.values()):
        print(i)
        signatures, _ = lsh_function(window, hash_function=t["hash"])
        # .get() so that a query hashing to an empty bucket is not a 500.
        neighbours.extend(t["entries"].get(str(signatures[0]), []))
    neighbours_with_frequency = dict(Counter(neighbours))
    # Group candidate indices by how many tables they collided in.
    for index, frequency in neighbours_with_frequency.items():
        output[str(frequency)].append(index)
    response = orjson.dumps(output)
    print("Similarity done: " + str(time() - t0))
    return response
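# Example round trip against /similarity (hypothetical values; assumes the
# dev server on localhost:5000 and `tables` as returned by /create-tables):
#
#     import requests
#     body = orjson.dumps({"query": window_values,   # one float per bin
#                          "tables": tables})
#     resp = requests.post('http://localhost:5000/similarity', data=body)
#     # The response maps collision frequency to window indices, e.g.
#     # {"3": [17, 4512], "1": [891, ...]}: windows colliding in more
#     # tables are more likely to be close to the query.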
@app.route('/update', methods=['POST'])
def update():
    t0 = time()
    raw_data = orjson.loads(request.data)
    data = np.load('processed-data.npy')
    label_data = raw_data["labelData"]
    tables = raw_data["tables"]
    window = raw_data["query"]
    window_size = int(raw_data['parameters']["windowsize"])
    hash_size = int(raw_data['parameters']["hashsize"])
    table_size = int(raw_data['parameters']["tablesize"])
    new_tables = []
    correct_indices = [int(index) for index, value in label_data.items()
                       if value is True]
    incorrect_indices = [int(index) for index, value in label_data.items()
                         if value is False]
    # Keep only the tables that already agree with the user's labels: every
    # correct window must share the query's bucket, no incorrect window may.
    for t in tables.values():
        valid = True
        signatures, _ = lsh_function(window, hash_function=t['hash'])
        neighbours = t["entries"].get(str(signatures[0]), [])
        for index in correct_indices:
            if index not in neighbours:
                valid = False
                break
        for index in incorrect_indices:
            if index in neighbours:
                valid = False
                break
        if valid:
            new_tables.append(t)
    # Re-sample hash functions until the labelled windows are separated.
    # Note: this spins forever if the labels are unsatisfiable.
    for index in range(table_size - len(new_tables)):
        entries = defaultdict(list)
        t1 = time()
        while True:
            correct_signatures, hash_function = lsh_function(
                data[correct_indices], window_size=window_size, hash_size=hash_size)
            incorrect_signatures, _ = lsh_function(
                data[incorrect_indices], hash_function=hash_function)
            if (correct_signatures.count(correct_signatures[0]) == len(correct_signatures)
                    and incorrect_signatures.count(correct_signatures[0]) == 0):
                break
        signatures, _ = lsh_function(data, hash_function=hash_function)
        for i in range(len(signatures)):
            # String keys, matching the JSON round trip of the client tables.
            entries[str(signatures[i])].append(i)
        print(str(index) + ": " + str(time() - t1))
        new_tables.append({
            "hash": hash_function.tolist(),
            "entries": entries
        })
    print('Update time: ' + str(time() - t0))
    response = {}
    for table_index in range(len(new_tables)):
        response[table_index] = {
            "hash": new_tables[table_index]["hash"],
            "entries": new_tables[table_index]["entries"]
        }
    response = jsonify(response)
    return response


@app.route('/query', methods=['POST'])
def query():
    t0 = time()
    raw_data = orjson.loads(request.data)
    window = raw_data['window']
    if isinstance(window, int):
        # An integer selects one of the precomputed windows by index.
        output = np.load('processed-data.npy')[window]
    else:
        # Otherwise the client sent raw values: min-max normalize them.
        # (Cast to an array first; the original arithmetic fails on a list.)
        window = np.asarray(window, dtype=float)
        output = (window - np.min(window)) / np.ptp(window)
    response = orjson.dumps(output.tolist())
    print("Query done: " + str(time() - t0))
    return response


@app.route('/window', methods=['POST'])
def window():
    t0 = time()
    raw_data = orjson.loads(request.data)
    indices = raw_data['indices']
    output = np.load('processed-data.npy')[indices]
    response = orjson.dumps(output.tolist())
    print("Window done: " + str(time() - t0))
    return response


@app.route('/average-progress', methods=['POST'])
def average_progress():
    t0 = time()
    raw_data = orjson.loads(request.data)
    all_windows = raw_data['windows']
    data = np.load('processed-data.npy')
    output = []
    actual_windows = []
    print("Starting average progress")
    print("Initialized: " + str(time() - t0))
    for windows in all_windows:
        t1 = time()
        actual_windows.extend(data[windows])
        if len(actual_windows) == 0:
            output.append([])
            continue
        max_values = np.maximum.reduce(actual_windows).tolist()
        min_values = np.minimum.reduce(actual_windows).tolist()
        average_values = (np.sum(actual_windows, 0) / len(actual_windows)).tolist()
        # Prepend so the most complete accumulation comes first in the output.
        output = [{
            'average': average_values,
            'max': max_values,
            'min': min_values
        }] + output
        print("Average calculated: " + str(time() - t1))
    response = orjson.dumps(output)
    print("Averages calculated: " + str(time() - t0))
    return response
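# /average-progress and /average-table both take {"windows": [[idx, ...], ...]}
# but return different envelopes. A sketch of the difference (toy numbers,
# numpy only; `group` is a hypothetical stand-in for data[windows]):
#
#     group = np.random.rand(5, 120)
#     hard_band = (np.minimum.reduce(group),
#                  np.maximum.reduce(group))         # /average-progress style
#     mu, sd = np.average(group, 0), np.std(group, 0)
#     soft_band = (mu - sd, mu + sd)                 # /average-table style
#
# The hard band bounds every member window; the soft band summarizes the
# spread around the mean instead of bounding every member.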
@app.route('/average-table', methods=['POST'])
def average_table():
    t0 = time()
    raw_data = orjson.loads(request.data)
    all_windows = raw_data['windows']
    data = np.load('processed-data.npy')
    output = []
    print("Initialized: " + str(time() - t0))
    for windows in all_windows:
        t1 = time()
        actual_windows = data[windows]
        print(len(actual_windows))
        average_values = np.average(actual_windows, 0)
        # average_values = np.sum(actual_windows, 0) / len(actual_windows)
        std_values = np.std(actual_windows, 0)
        max_values = average_values + std_values
        min_values = average_values - std_values
        # max_values = np.maximum.reduce(actual_windows).tolist()
        # min_values = np.minimum.reduce(actual_windows).tolist()
        output.append({
            'average': average_values.tolist(),
            'max': max_values.tolist(),
            'min': min_values.tolist()
        })
        print("Average calculated: " + str(time() - t1))
    response = orjson.dumps(output)
    print("Averages calculated: " + str(time() - t0))
    return response


def preprocess():
    data = np.load('processed-data.npy')
    # data = np.array(data, dtype='double')
    # data = np.reshape(data, (int(len(data) / 1), 1, len(data[0])))
    # data = np.repeat(data, repeats=1, axis=1)
    subset = []
    # query = data[80503]
    t0 = time()
    # for i, window in enumerate(data):
    #     print(i)
    #     a = dtw.dtw(window, query, dist_method="Euclidean").distance
    # print(time() - t0)
    # print("done")
    # Greedily build an r-net: keep a window only if it is at least r away
    # (in Euclidean distance) from every window already in the subset.
    r = 3
    for i, window in enumerate(data):
        if i % 10000 == 0:
            print(str(i) + ':' + str(len(subset)))
        state = 1
        for s in subset:
            if np.linalg.norm(window - data[s]) < r:
                state = 0
                break
        if state == 1:
            subset.append(i)
    # subset = sample(list(range(len(data))), 50)
    # print(subset)
    dtw_distances = []
    eq_distances = []
    # Pairwise Euclidean distances over the subset (the DTW comparison is
    # kept below, commented out).
    for i, index_1 in enumerate(subset):
        print(i)
        for j, index_2 in enumerate(subset):
            if index_1 == index_2:
                continue
            e = distance.euclidean(data[index_1], data[index_2])
            eq_distances.append(e)
            # d = dtw.dtw(data[index_1], data[index_2], dist_method="Euclidean",
            #             window_type="sakoechiba",
            #             window_args={"window_size": 6}).distance
            # print(d - e)
            # if e != 0:
            #     dtw_distances.append(d)
            #     eq_distances.append(e)
            # else:
            #     dtw_distances.append(0)
            #     eq_distances.append(1)
    # ratios = np.array(dtw_distances) / np.array(eq_distances)
    # mean_dtw = np.mean(dtw_distances)
    # sd_dtw = np.std(dtw_distances)
    mean_eq = np.mean(eq_distances)
    sd_eq = np.std(eq_distances)
    a = 1
    sd = 1
    # a = np.mean(ratios)
    # sd = np.std(ratios)
    # theta = mean_dtw - 2.58 * sd_dtw
    # Similarity threshold: the lower tail of the pairwise-distance
    # distribution, 2.58 standard deviations below the mean.
    theta = mean_eq - 2.58 * sd_eq
    # r = theta / ((a - sd) * math.sqrt(120))
    r = theta / math.sqrt(120)
    # print(mean_dtw)
    # print(sd_dtw)
    print(a)
    print(sd)
    print(theta)
    print(r)
    print(time() - t0)
    return r, a, sd
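# Worked example of the threshold arithmetic above (made-up numbers): with
# mean_eq = 10.0 and sd_eq = 2.0, theta = 10.0 - 2.58 * 2.0 = 4.84, and for
# windows of length 120 the per-dimension radius is
# r = 4.84 / sqrt(120) ~= 0.44. r, together with the ratio statistics a and
# sd (both fixed to 1 here), is then handed to _lsh.lsh() in lsh_method().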
def dtw_query():
    # Brute-force DTW scan against a fixed query window; development-time
    # benchmark, not wired to any route.
    data = np.load('processed-data.npy')
    data = np.array(data, dtype='double')
    data = np.repeat(data, repeats=1, axis=0)
    data = np.reshape(data, (int(len(data) / 1), 1, len(data[0])))
    query = data[80503]
    t0 = time()
    for i, window in enumerate(data):
        print(i)
        alignment = dtw.dtw(query, window, keep_internals=True)
    print(time() - t0)


def lsh_method(r, a, sd):
    # Compare the _lsh candidate ranking against an exact Euclidean top-k.
    create_windows()
    query_n = 80503
    dim = 10
    data = np.load('processed-data.npy')
    data = np.array(data, dtype='double')
    data = np.reshape(data, (len(data), len(data[0]), 1))
    data = np.repeat(data, repeats=1, axis=2)
    query = data[query_n]
    candidates, hf = _lsh.lsh(data, query, r, a, sd)
    print(repr(candidates[0:10]))

    # data = np.load('processed-data.npy')
    # query = data[query_n]
    # distances = [_ucrdtw.ucrdtw(window, query, 0.05, False)[1] for window in data]
    # topk_dtw = sorted(range(len(distances)), key=lambda k: distances[k])
    # print(topk_dtw[0:10])

    # scipy's euclidean expects 1-D vectors, so drop the trailing axis.
    distances_ed = [distance.euclidean(query.flatten(), window.flatten())
                    for window in data]
    topk_ed = sorted(range(len(distances_ed)), key=lambda k: distances_ed[k])

    # accuracy = 0
    # for index in topk_dtw[0:50]:
    #     if index in candidates[0:50]:
    #         accuracy += 1
    # print(accuracy)

    accuracy = 0
    for index in topk_ed[0:20]:
        if index in candidates[0:20]:
            accuracy += 1
    print(accuracy)

    accuracy = 0
    for index in topk_ed[0:50]:
        if index in candidates[0:50]:
            accuracy += 1
    print(accuracy)

    # accuracy = 0
    # for index in topk_dtw[0:50]:
    #     if index in candidates[0:1000]:
    #         accuracy += 1
    # print(accuracy)
    #
    # accuracy = 0
    # for index in topk_dtw[0:50]:
    #     if index in candidates[0:5000]:
    #         accuracy += 1
    # print(accuracy)
    #
    # accuracy = 0
    # for index in topk_dtw[0:50]:
    #     if index in candidates[0:10000]:
    #         accuracy += 1
    # print(accuracy)
    #
    # accuracy = 0
    # for index in topk_dtw[0:50]:
    #     if index in candidates[0:50000]:
    #         accuracy += 1
    # print(accuracy)
    #
    # accuracy = 0
    # for index in topk_dtw[0:50]:
    #     if index in candidates:
    #         accuracy += 1
    # print(accuracy)


# Module-level experiments from development, kept for reference:
# r, a, sd = preprocess()
# lsh_method(r, a, sd)

# create_windows()
# query_n = 80503
# data = np.load('processed-data.npy')
# data = np.array(data, dtype='double')
# data = np.reshape(data, (len(data), len(data[0]), 1))
# data = np.repeat(data, repeats=10, axis=2)
# query = data[query_n]
#
# candidates, hf = _lsh.lsh(data, query)
#
# data = np.load('processed-data.npy')
# query = data[query_n]
# data = np.load('processed-data.npy')
# print(_ucrdtw.ucrdtw(data[query_n], data[0], 0.05, False)[1])
#
# l2_norm = lambda x, y: (x - y) ** 2
#
# data = np.load('processed-data.npy')
# data = np.array(data, dtype='double')
# data = np.repeat(data, repeats=1, axis=0)
# data = np.reshape(data, (int(len(data) / 1), 1, len(data[0])))
# query = data[query_n]
#
# distances = [_ucrdtw.ucrdtw(window, query, 0.05, False)[1] for window in data]
# topk_dtw = sorted(range(len(distances)), key=lambda k: distances[k])
# print(topk_dtw[0:10])
#
# # Generate our data
# template = data[query_n]
# rt, ct = template.shape
# rq, cq = query.shape
# t0 = time()
# # Calculate the alignment vector and corresponding distance
# alignment = dtw.dtw(query, template, keep_internals=True)
# print(alignment.distance)
# print(time() - t0)
#
# np.save('topk', np.array(topk_dtw))

print('done')

# topk_dtw = np.load('topk.npy')
# distances_ed = [distance.euclidean(query, window) for window in data]
# topk_ed = sorted(range(len(distances_ed)), key=lambda k: distances_ed[k])
#
# accuracy = 0
# for index in topk_dtw[0:50]:
#     if index in candidates[0:50]:
#         accuracy += 1
# print(accuracy)
#
# accuracy = 0
# output = []
# for index in topk_ed[0:50]:
#     if index in candidates:
#         accuracy += 1
# print(accuracy)
#
# accuracy = 0
# for index in topk_ed[0:50]:
#     if index in candidates[0:50]:
#         accuracy += 1
# print(accuracy)
#
# accuracy = 0
# for index in topk_ed[0:20]:
#     if index in candidates[0:20]:
#         accuracy += 1
# print(accuracy)
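# Entry point. A minimal sketch using the standard Flask development server;
# host and port are assumptions, and the original project may launch the app
# differently (e.g. via `flask run` or a WSGI server):
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)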