from flask import Flask, jsonify, request
from flask_cors import CORS
from collections import defaultdict, Counter
from time import time

import numpy as np
import orjson
import bigwig
import bbi
from sklearn import preprocessing

reload = True

app = Flask(__name__)
CORS(app)


def calculate_signatures_random_weights(data, window_size=None, hash_size=None, hash_function=None):
    """LSH signatures from uniform random projection weights, as bit strings."""
    if hash_function is None:
        hash_function = np.random.uniform(-1, 1, size=(window_size, hash_size))
    signatures_bool = np.dot(data, hash_function) > 0
    if signatures_bool.ndim == 1:
        # A single window was hashed: return its signature directly.
        return ''.join('1' if x else '0' for x in signatures_bool)
    return [''.join('1' if x else '0' for x in lst) for lst in signatures_bool], hash_function


def calculate_signatures_cumsum_weights(data, window_size=None, hash_size=None, hash_function=None):
    """LSH signatures from cumulative-sum (random walk) weights, packed into ints."""
    if hash_function is None:
        hash_function = np.array(
            [np.cumsum(np.random.uniform(-1, 1, window_size)) for _ in range(hash_size)]
        ).transpose()
    signatures_bool = np.dot(data, hash_function) > 0
    signatures_int = np.packbits(signatures_bool)
    return signatures_int.tolist(), hash_function


def calculate_signatures_new(data, window_size=None, hash_size=None, hash_function=None):
    """Batched variant of the cumulative-sum hashing that slides each hash
    vector over the raw series instead of materializing every window."""
    if hash_function is None:
        hash_function = np.array(
            [np.cumsum(np.random.uniform(-1, 1, window_size)) for _ in range(hash_size)]
        ).transpose()
    if len(data) == len(np.array(hash_function)[:, 0]):
        # A single window was passed in: hash it and pack the bits into the same
        # integer key format the batched path below produces (assumes hash_size <= 8).
        signatures_bool = np.dot(data, hash_function) > 0
        output = int(np.packbits(signatures_bool)[0])
        print(output)
        return output
    print('starting hashing')
    t0 = time()
    all_signatures = []
    batch_size = 20
    data = data.transpose()
    # Each row of `temp` holds the same hash vector shifted by one position, so a
    # single dot product hashes `batch_size` consecutive windows at once.
    temp = np.zeros((batch_size, window_size + batch_size - 1))
    for h in range(hash_size):
        for i in range(batch_size):
            temp[i, i:i + window_size] = hash_function[:, h]
        print('first: ' + str(time() - t0))
        signatures_bool = [np.dot(temp, data[i:i + window_size + batch_size - 1]) > 0
                           for i in range(0, len(data) - window_size, batch_size)]
        print('second: ' + str(time() - t0))
        all_signatures.append(np.array(signatures_bool).flatten().astype(int))
    print('done')
    # Pack each window's hash bits into one integer key per window (axis=1,
    # assuming hash_size <= 8 so that a window fits into a single byte).
    signatures_int = np.packbits(np.stack(np.array(all_signatures), axis=1), axis=1).flatten()
    return signatures_int.tolist(), hash_function


lsh_function = calculate_signatures_new


@app.route('/', methods=['GET'])
def index():
    return "hi"


@app.route('/read-data', methods=['GET'])
def read_data():
    t0 = time()
    size = bbi.chromsizes('test.bigWig')['chr1']
    bins = 100000
    data = bigwig.get('test.bigWig', 'chr1', 0, size, bins)
    print(data.shape)
    response = {
        "index": list(range(0, size, int(size / bins))),
        "values": data.tolist()
    }
    response = orjson.dumps(response)
    print('Data read: ' + str(time() - t0))
    return response


@app.route('/create-windows', methods=['POST'])
def create_windows():
    t0 = time()
    if reload:
        raw_data = request.json
        window_size = int(raw_data['parameters']["windowsize"])
        chromsize = bbi.chromsizes('test.bigWig')['chr1']
        step_size = chromsize / 10000
        # Resample the full chromosome and normalize it to [0, 1].
        data = bigwig.get('test.bigWig', 'chr1', 0, chromsize, 20000000)
        data = (data - np.min(data)) / np.ptp(data)
        print(data.shape)
        np.save('processed-data', data)
    print('Windows created: ' + str(time() - t0))
    return '1'
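
# A minimal sketch (not part of the server API) of the random-projection idea the
# signature functions above rely on: windows whose projections onto the same
# random vectors share signs get identical bit signatures, so similar windows
# tend to collide in the same table bucket. All values below are illustrative
# assumptions, not data from the server.
def _signature_sketch():
    rng_hash = np.random.uniform(-1, 1, size=(4, 8))  # (window_size, hash_size)
    close_a = np.array([0.10, 0.20, 0.30, 0.40])      # two similar windows ...
    close_b = np.array([0.11, 0.19, 0.31, 0.42])
    far_c = np.array([0.90, 0.10, 0.80, 0.00])        # ... and a dissimilar one

    def signature(window):
        return ''.join('1' if bit else '0' for bit in (window @ rng_hash > 0))

    # The similar pair usually shares a signature; the outlier usually differs.
    print(signature(close_a), signature(close_b), signature(far_c))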
@app.route('/create-tables', methods=['POST'])
def create_tables():
    data = np.load('processed-data.npy')
    raw_data = orjson.loads(request.data)
    window_size = int(raw_data['parameters']["windowsize"])
    hash_size = int(raw_data['parameters']["hashsize"])
    table_size = int(raw_data['parameters']["tablesize"])
    t0 = time()
    hash_functions, tables = lsh(data, window_size, hash_size, table_size)
    response = {}
    for table_index in range(table_size):
        response[str(table_index)] = {
            "hash": hash_functions[table_index],
            "entries": tables[table_index]
        }
    response = jsonify(response)
    print('done: ' + str(time() - t0))
    return response


def lsh(data, window_size, hash_size, table_size):
    """Builds `table_size` hash tables, each mapping a signature to the list of
    window indices that hash to it."""
    tables_hash_function = []
    tables = []
    print(data.shape)
    for index in range(table_size):
        signatures, hash_function = lsh_function(data, window_size=window_size, hash_size=hash_size)
        print('creating dictionary')
        table = defaultdict(list)
        for v, k in enumerate(signatures):
            table[k].append(v)
        tables.append(table)
        tables_hash_function.append(hash_function.tolist())
    hash_functions = tables_hash_function
    return hash_functions, tables


@app.route('/similarity', methods=['POST'])
def similarity():
    t0 = time()
    raw_data = orjson.loads(request.data)
    window = raw_data['query']
    tables = raw_data["tables"]
    neighbours = []
    output = defaultdict(list)
    for t in tables.values():
        signature = lsh_function(window, hash_function=t["hash"])
        # Table keys are strings after the JSON round trip.
        neighbours.extend(t["entries"][str(signature)])
    # Group candidate indices by how many tables returned them.
    neighbours_with_frequency = dict(Counter(neighbours))
    for index, frequency in neighbours_with_frequency.items():
        output[str(frequency)].append(index)
    response = orjson.dumps(output)
    print("Similarity done: " + str(time() - t0))
    return response


@app.route('/update', methods=['POST'])
def update():
    t0 = time()
    raw_data = orjson.loads(request.data)
    data = np.load('processed-data.npy')
    label_data = raw_data["labelData"]
    tables = raw_data["tables"]
    window = raw_data["query"]
    window_size = int(raw_data['parameters']["windowsize"])
    hash_size = int(raw_data['parameters']["hashsize"])
    table_size = int(raw_data['parameters']["tablesize"])
    new_tables = []
    correct_indices = [int(index) for index, value in label_data.items() if value is True]
    incorrect_indices = [int(index) for index, value in label_data.items() if value is False]
    # Keep only the tables that already agree with the user's labels.
    for t in tables.values():
        valid = True
        signature = lsh_function(window, hash_function=t['hash'])
        neighbours = t["entries"][str(signature)]
        for index in correct_indices:
            if index not in neighbours:
                valid = False
                break
        for index in incorrect_indices:
            if index in neighbours:
                valid = False
                break
        if valid:
            new_tables.append(t)
    # Resample hash functions until one separates the labeled windows, then
    # rebuild a full table with it.
    for index in range(table_size - len(new_tables)):
        entries = defaultdict(list)
        t1 = time()
        while True:
            correct_signatures, hash_function = lsh_function(
                data[correct_indices], window_size=window_size, hash_size=hash_size)
            incorrect_signatures, _ = lsh_function(
                data[incorrect_indices], hash_function=hash_function)
            if (correct_signatures.count(correct_signatures[0]) == len(correct_signatures)
                    and incorrect_signatures.count(correct_signatures[0]) == 0):
                break
        signatures, _ = lsh_function(data, hash_function=hash_function)
        for i in range(len(signatures)):
            entries[signatures[i]].append(i)
        print(str(index) + ": " + str(time() - t1))
        new_tables.append({
            "hash": hash_function.tolist(),
            "entries": entries
        })
    print('Update time: ' + str(time() - t0))
    response = {}
    for table_index in range(len(new_tables)):
        response[table_index] = {
            "hash": new_tables[table_index]["hash"],
            "entries": new_tables[table_index]["entries"]
        }
    response = jsonify(response)
    return response
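
# Illustrative sketch (hypothetical values, not part of the server API) of the
# voting scheme /similarity uses: candidate indices returned by the individual
# hash tables are tallied, then grouped by how many tables returned them, so
# larger keys in the response indicate stronger matches.
def _similarity_ranking_sketch():
    hits = [12, 40, 12, 7, 12, 40]  # bucket contents gathered across all tables
    grouped = defaultdict(list)
    for index, frequency in Counter(hits).items():
        grouped[str(frequency)].append(index)
    print(dict(grouped))  # {'3': [12], '2': [40], '1': [7]}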
@app.route('/query', methods=['POST'])
def query():
    t0 = time()
    raw_data = orjson.loads(request.data)
    window = raw_data['window']
    if isinstance(window, int):
        # The client sent a start index: slice the stored series.
        output = np.load('processed-data.npy')[window:window + 12000]
    else:
        # The client sent raw values: rescale them to [-1, 1].
        output = preprocessing.minmax_scale(window, (-1, 1))
    response = orjson.dumps(output.tolist())
    print("Query done: " + str(time() - t0))
    return response


@app.route('/window', methods=['POST'])
def window():
    t0 = time()
    raw_data = orjson.loads(request.data)
    indices = raw_data['indices']
    output = np.load('processed-data.npy')[indices]
    response = orjson.dumps(output.tolist())
    print("Window done: " + str(time() - t0))
    return response


@app.route('/average-progress', methods=['POST'])
def average_progress():
    t0 = time()
    raw_data = orjson.loads(request.data)
    all_windows = raw_data['windows']
    data = np.load('processed-data.npy')
    output = []
    actual_windows = []
    print("Starting average progress")
    print("Initialized: " + str(time() - t0))
    for windows in all_windows:
        t1 = time()
        actual_windows.extend(data[windows])
        if len(actual_windows) == 0:
            output.append([])
            continue
        max_values = np.maximum.reduce(actual_windows).tolist()
        min_values = np.minimum.reduce(actual_windows).tolist()
        average_values = (np.sum(actual_windows, 0) / len(actual_windows)).tolist()
        # Prepend so the most inclusive average (over all windows seen so far)
        # comes first in the response.
        output = [{
            'average': average_values,
            'max': max_values,
            'min': min_values
        }] + output
        print("Average calculated: " + str(time() - t1))
    response = orjson.dumps(output)
    print("Averages calculated: " + str(time() - t0))
    return response


@app.route('/average-table', methods=['POST'])
def average_table():
    t0 = time()
    raw_data = orjson.loads(request.data)
    all_windows = raw_data['windows']
    data = np.load('processed-data.npy')
    output = []
    print("Initialized: " + str(time() - t0))
    for windows in all_windows:
        t1 = time()
        actual_windows = data[windows]
        print(len(actual_windows))
        # The band is the mean +/- one standard deviation rather than the
        # true per-position min/max.
        average_values = np.average(actual_windows, 0)
        std_values = np.std(actual_windows, 0)
        max_values = average_values + std_values
        min_values = average_values - std_values
        output.append({
            'average': average_values.tolist(),
            'max': max_values.tolist(),
            'min': min_values.tolist()
        })
        print("Average calculated: " + str(time() - t1))
    response = orjson.dumps(output)
    print("Averages calculated: " + str(time() - t0))
    return response
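
# The module as given never starts the server itself; assuming the standard
# Flask development entry point (host, port, and debug settings below are
# assumptions, not taken from the original):
if __name__ == '__main__':
    app.run(host='127.0.0.1', port=5000, debug=True)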