import math
from collections import defaultdict
from time import time

import numpy as np

import _lsh
import _ucrdtw
from libs.DBA_multivariate import performDBA
from tslearn.metrics import dtw


def get_lsh_parameters(data):
    """data: 3d array [m][t][d]. Returns the estimated LSH parameters [r, a, sd]."""
    parameters = preprocess(data)
    return [float(parameters[0]), float(parameters[1]), float(parameters[2])]


def lsh(data, query, parameters=None, weights=None):
    """
    data: 3d array [m][t][d]
    query: 2d array [t][d]
    """
    if parameters is None:
        parameters = preprocess(data)
    r, a, sd = parameters[0], parameters[1], parameters[2]

    data = np.array(data, dtype='float32')
    query = np.array(query, dtype='float32')

    # Run the compiled LSH extension, optionally with per-channel weights.
    if weights is None:
        candidates, distances, hf = _lsh.lsh(data, query, r, a, sd, 1)
    else:
        candidates, distances, hf = _lsh.lsh(data, query, r, a, sd, 1, weights)

    # Rank candidates by their distance summed over all hash tables.
    summed_distances = defaultdict(int)
    for l in range(len(candidates)):
        for k in range(len(candidates[0])):
            for i in range(len(candidates[0][0])):
                summed_distances[candidates[l][k][i]] += distances[l][k][i]
    sorted_distances = {k: v for k, v in sorted(summed_distances.items(), key=lambda item: item[1])}
    # The numpy round-trip coerces numpy scalars to native Python types (e.g. for JSON).
    average_candidates = np.array(list(sorted_distances.keys())).tolist()
    average_distances = np.array(list(sorted_distances.values())).tolist()

    # Per table: bucket candidates into 20 bins of width median/10; everything
    # beyond twice the median distance falls into the last bin.
    tables = []
    samples_set = set()
    candidates = candidates.tolist()
    for l in range(len(candidates)):
        for k in range(len(candidates[0])):
            samples_set.update(candidates[l][k][0:5])
            samples_set.update(candidates[l][k][-100:-95])
            table = defaultdict(list)
            length = len(distances[l][k])
            median = distances[l][k][math.ceil(length / 2)]
            stepsize = median / 10
            indices = [19 if x > median * 2 else math.floor(x / stepsize) for x in distances[l][k]]
            for i in range(len(candidates[0][0])):
                table[str(indices[i])].append(candidates[l][k][i])
            tables.append(table)

    # The same bucketing, applied to the averaged ranking.
    length = len(average_distances)
    median = average_distances[math.ceil(length / 2)]
    stepsize = median / 10
    indices = [19 if x > median * 2 else math.floor(x / stepsize) for x in average_distances]
    average_table = defaultdict(list)
    for i in range(len(average_candidates)):
        average_table[str(indices[i])].append(average_candidates[i])

    samples = [x for x in average_candidates if x in samples_set]

    response = {
        "hash_functions": hf.reshape((len(candidates) * len(candidates[0]), len(query[0]))).tolist(),
        "candidates": candidates,
        "distances": distances.tolist(),
        "average_candidates": average_candidates,
        "average_distances": average_distances,
        "tables": tables,
        "average_table": average_table,
        "samples": samples,
        "parameters": [float(r), float(a), float(sd)]
    }
    return response
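# A minimal sketch of driving the search above, assuming the compiled `_lsh`
# extension is importable and `windows` is a float array of windows in the
# layout the caller already uses with this module. The function name and the
# print format are illustrative only, not part of the original API.
def example_search(windows):
    params = get_lsh_parameters(windows)                     # [r, a, sd]
    response = lsh(windows, windows[0], parameters=params)   # query with the first window
    print(response["average_candidates"][:10])               # ten best-ranked window indices
    return response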
def preprocess(data, r=None):
    """
    Estimate the LSH parameters (r, a, sd) from a sample of the data.

    A greedy subset is grown with radius r: a window joins it only if it is at
    least r away (in L2) from every window already selected. r is then bisected
    until the subset holds between 10 and 400 windows, and the subset is used
    to estimate how DTW distances relate to L2 distances.
    """
    subset = []
    t0 = time()
    if r is None:
        r = 19.375
        # r = data.shape[2]
    started = False
    first = 0
    last = r * 2
    i = 0
    print("r = " + str(r))
    while True:
        state = 1
        for s in subset:
            if np.linalg.norm(data[i] - data[s]) < r:
                state = 0
                break
        if state == 1:
            subset.append(i)
        i = i + 1
        if len(subset) > 400:
            # Too many representatives: raise r (doubling the upper bound until
            # a shrink step has happened at least once) and restart the scan.
            print('bigger')
            first = r
            if not started:
                last = last * 2
            r = (first + last) / 2
            subset = []
            i = 0
            print("r = " + str(r))
        elif (i == 10000 or i == len(data)) and len(subset) < 10:
            # Too few representatives: lower r and restart the scan.
            print('smaller')
            started = True
            last = r
            r = (first + last) / 2  # r / 2
            subset = []
            i = 0
            print("r = " + str(r))
        elif i == len(data):
            break
    # subset = sample(list(range(len(data))), 200)

    # Collect pairwise DTW and L2 distances over the subset.
    dtw_distances = []
    eq_distances = []
    for index_1 in subset:
        for index_2 in subset:
            if index_1 == index_2:
                continue
            e = np.linalg.norm(data[index_1] - data[index_2])
            if math.isnan(e) or e == 0:
                eq_distances.append(0.0001)
                dtw_distances.append(0.0001)
                continue
            eq_distances.append(e)
            d = dtw(data[index_1], data[index_2], global_constraint='sakoe_chiba',
                    sakoe_chiba_radius=int(0.05 * 120))  # window length 120 is hard-coded
            dtw_distances.append(d)

    ratios = np.array(dtw_distances) / np.array(eq_distances)
    mean_dtw = np.mean(dtw_distances)
    sd_dtw = np.std(dtw_distances)
    mean_eq = np.mean(eq_distances)
    sd_eq = np.std(eq_distances)
    a = np.mean(ratios)
    sd = np.std(ratios)
    # theta sits 2.58 standard deviations below the mean DTW distance
    # (the 0.5% quantile under a normality assumption).
    theta = mean_dtw - 2.58 * sd_dtw
    # theta = mean_eq - 2.58 * sd_eq
    r = theta / ((a - sd) * math.sqrt(120))
    if r < 0:
        print('Actual r ' + str(r))
        r = mean_dtw / 100
    # r = theta / math.sqrt(120)
    print('Mean: ' + str(mean_dtw))
    print('Stdev: ' + str(sd_dtw))
    print('Ratio mean: ' + str(a))
    print('Ratio stdev: ' + str(sd))
    print('Theta: ' + str(theta))
    print('r: ' + str(r))
    print('Preprocessing time: ' + str(time() - t0))
    return r, a, sd


def weights(data, query, old_weights, labels, hash_functions):
    """
    Blend the old per-channel weights with feedback from windows labelled as
    relevant and from the hash functions of tables the user marked as good.
    `labels` maps window indices (as strings) to booleans.
    """
    alpha = 0.2
    d = len(query)
    good_indices = [int(index) for index, value in labels.items() if value is True]
    all_good_windows = data[good_indices]

    def normalize(array):
        # Scale to sum d, then take the square root so the weights act as
        # per-channel multipliers inside a squared distance.
        array /= np.sum(array)
        array *= d
        return np.sqrt(array)

    good_distances = np.zeros(len(query))
    for window in all_good_windows:
        for i in range(len(all_good_windows[0])):
            good_distances[i] += _ucrdtw.ucrdtw(query[i], window[i], 0.05, False)[1]
    if len(all_good_windows) != 0:
        # Invert: channels that match the labelled windows well get high weight.
        good_distances = np.square(good_distances)
        if np.sum(good_distances) != 0:
            good_distances /= np.sum(good_distances)
        good_distances = np.ones(len(query)) - good_distances
        good_distances = normalize(good_distances)

    if len(hash_functions) != 0:
        summed_hash_functions = np.sum(hash_functions, axis=0)
        summed_hash_functions = np.square(summed_hash_functions)
        normalized_hash_functions = normalize(summed_hash_functions)

    if len(hash_functions) + len(all_good_windows) == 0:
        print("no update")
        new_weights = np.array(old_weights)
    elif len(hash_functions) == 0:
        print("only windows")
        new_weights = alpha * np.array(old_weights) + (1 - alpha) * good_distances
        new_weights = normalize(np.square(new_weights))
    elif len(all_good_windows) == 0:
        print("only tables")
        new_weights = alpha * np.array(old_weights) + (1 - alpha) * normalized_hash_functions
        new_weights = normalize(np.square(new_weights))
    else:
        print("tables & windows")
        new_weights = (alpha * np.array(old_weights)
                       + 0.5 * (1 - alpha) * good_distances
                       + 0.5 * (1 - alpha) * normalized_hash_functions)
        new_weights = normalize(np.square(new_weights))

    print(new_weights)
    return new_weights.tolist()


def table_info(data, table):
    """For each cluster in a table, return an average prototype with a ±1 SD band."""
    prototypes = []
    for cluster in table:
        windows = data[cluster]
        average_values = np.average(windows, 0)
        std_values = np.std(windows, 0)
        max_values = average_values + std_values
        min_values = average_values - std_values
        prototypes.append({
            'average': average_values.tolist(),
            'max': max_values.tolist(),
            'min': min_values.tolist()
        })
    # Pairwise DTW between prototypes is expensive and currently disabled:
    # distances = [[dtw(np.array(v["average"]), np.array(w["average"]),
    #                   global_constraint='sakoe_chiba',
    #                   sakoe_chiba_radius=int(0.05 * 120))
    #               for w in prototypes] for v in prototypes]
    return {'prototypes': prototypes, 'distances': []}


def query(data, window_indices):
    """Return a single window, or the DBA average of all windows labelled True."""
    if isinstance(window_indices, int):
        output = data[window_indices]
    else:
        indices = [int(index) for index, value in window_indices.items() if value is True]
        output = performDBA(data[indices])
    return output.tolist()
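# A sketch of one relevance-feedback round built from the pieces above: run the
# initial search, let the user label a few windows (True = relevant), update the
# per-channel weights, then search again with them. It assumes the same compiled
# extensions as the rest of the module; `example_feedback_round` and its
# argument names are illustrative only.
def example_feedback_round(data, query_window, labels):
    initial = lsh(data, query_window)
    uniform = [1.0] * len(query_window)              # start from uniform channel weights
    new_weights = weights(data, query_window, uniform,
                          labels, initial["hash_functions"])
    refined = lsh(data, query_window,
                  parameters=initial["parameters"], weights=new_weights)
    return refined["average_candidates"]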
def debug_test_lsh():
    data = np.load('data/processed-data.npy')
    # data = np.repeat(data, repeats=7, axis=1)
    print(data.shape)
    # Reinterpret each window as [d][t]; note that np.reshape rearranges the
    # buffer rather than transposing axes.
    data = np.reshape(data, (len(data), len(data[0][0]), len(data[0])))
    data = np.array(data, dtype='float32')
    print(data.shape)

    r, a, sd = preprocess(data, 11.25)
    query_n = 1234
    query = data[query_n]

    # Approximate search via LSH, ranked by distance summed over all tables.
    t0 = time()
    candidates, distances, hf = _lsh.lsh(data, query, r, a, sd)
    print("Calculated approximate in: " + str(time() - t0))
    summed_distances = defaultdict(int)
    for l in range(len(candidates)):
        for k in range(len(candidates[0])):
            for i in range(len(candidates[0][0])):
                summed_distances[candidates[l][k][i]] += distances[l][k][i]
    sorted_distances = {k: v for k, v in sorted(summed_distances.items(), key=lambda item: item[1])}
    candidates = list(sorted_distances.keys())
    print("Calculated ranked in: " + str(time() - t0))
    print(candidates[0:20])

    # Exact DTW baseline.
    t0 = time()
    # distances = [dtw_ndim.distance_fast(window, query) for window in data]
    distances = [dtw(window, query, global_constraint='sakoe_chiba',
                     sakoe_chiba_radius=int(0.05 * 120)) for window in data]
    topk_dtw = sorted(range(len(distances)), key=lambda k: distances[k])
    print("Calculated exact dtw in: " + str(time() - t0))
    print(topk_dtw[0:20])

    # Exact L2 baseline (timing only).
    t0 = time()
    l2distances = [np.linalg.norm(window - query) for window in data]
    print("Calculated exact l2 in: " + str(time() - t0))

    # distances_ed = [distance.euclidean(query, window) for window in data]
    # topk_ed = sorted(range(len(distances_ed)), key=lambda k: distances_ed[k])

    # Recall: how many of the 20 exact-DTW nearest neighbours the LSH found.
    accuracy = 0
    for index in topk_dtw[0:20]:
        if index in candidates:
            accuracy += 1
    print(accuracy)
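# Illustrative entry point (an assumption, not part of the original API) so the
# benchmark above can be run by executing this file directly.
if __name__ == '__main__':
    debug_test_lsh()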