In [1]:
import pandas as pd
import numpy as np
from time import time

datafile = 'data/21.csv'

# Dimensions of the synthetic dataset; `data` has shape (M, T, N).
N = 100     # size of the last axis (presumably channels per subwindow -- confirm)
T = 100     # size of the middle axis (presumably time steps per subwindow -- confirm)
M = 100000  # number of subwindows (first axis)

# Fixed seed so a fresh "Restart & Run All" regenerates the same dataset.
SEED = 0
rng = np.random.default_rng(SEED)

# Draw uniform [0, 1) samples directly as float32. Sampling in float64 and
# casting afterwards would first materialise an ~8 GB temporary (M*T*N = 1e9
# values) and then copy it into a second ~4 GB array.
data = rng.random(size=(M, T, N), dtype=np.float32)

We sample a number of subwindows, which will be used as queries for the search algorithms.

In [2]:
import random
from time import time

# Pick 10 distinct subwindow indices (positions along the first axis of
# `data`) that serve as queries; a range is already a valid population for
# random.sample, so no intermediate list is needed.
targets = random.sample(range(len(data)), 10)
print(targets)

[23484, 81670, 8609, 31051, 42832, 14303, 85653, 67886, 17319, 93188]


## PSEUDo

For the LSH algorithm some preprocessing is done to find the right LSH parameters.

In [3]:
import sys

sys.path.insert(0, '../Flaskserver')
import importlib
from pseudo import preprocess
import _lsh

topk_dtw = []

# Tune the LSH parameters on the full dataset. The second argument is the size
# of the last axis of `data`. The returned r, a and sd are passed to _lsh.lsh
# by the query cells, so these names must not be rebound afterwards.
print('Preprocessing:')
t0 = time()
r, a, sd = preprocess(data, data.shape[2])

# Read the clock once so the stored duration is exactly the one reported
# (previously it was sampled again after the print call).
pseudo_preprocess_time = time() - t0
print('Preprocessing done. Took {:.2f} seconds ({:.1f} minutes).'.format(pseudo_preprocess_time, pseudo_preprocess_time / 60))

Preprocessing:
r = 100
smaller
r = 50.0
smaller
r = 25.0
bigger
r = 37.5
bigger
r = 43.75
smaller
r = 40.625
bigger
r = 42.1875
smaller
r = 41.40625
smaller
r = 41.015625
Mean: 41.028657241907524
Stdev: 0.16600796717920388
Ratio mean: 0.9930308931432998
Ratio stdev: 0.004133342888573115
Theta: 40.60035668658518
r: 3.7478993980502606
Preprocessing time: 11.971427917480469
Preprocessing done. Took 11.97 seconds (0.2 minutes).


Now we run the LSH algorithm for all targets and calculate the most similar subwindows.

In [4]:
from collections import defaultdict
# Run one LSH query per sampled target and rank the returned candidates by
# their accumulated distance. Results:
#   total_lsh_times     -- wall-clock seconds per target
#   all_lsh_candidates  -- per target, candidate ids sorted by ascending summed distance
t0 = time()
total_lsh_times = []
all_lsh_candidates = []
for i, target in enumerate(targets):
    t1 = time()
    query = data[target]
    print('doing lsh')
    # Last argument 0 -- presumably selects the DTW variant (cf. `topk_dtw`);
    # confirm against the _lsh module.
    lsh_candidates, lsh_distances, _ = _lsh.lsh(data, query, r, a, sd, 0)

    # Sum the distance reported for each candidate over every slot of the
    # 3-level result structure. The innermost index is deliberately NOT named
    # `a`: that would overwrite the LSH parameter `a` from preprocessing and
    # every following query would silently run with a loop index instead.
    distance_sums = defaultdict(int)
    for l in range(len(lsh_candidates)):
        for k in range(len(lsh_candidates[0])):
            for j in range(len(lsh_candidates[0][0])):
                distance_sums[lsh_candidates[l][k][j]] += lsh_distances[l][k][j]

    # Ascending by summed distance; sorted() is stable, so ties keep
    # first-seen order.
    candidates = sorted(distance_sums, key=distance_sums.get)

    elapsed = time() - t1
    total_lsh_times.append(elapsed)
    print('Target #{} done! Took {:.2f} seconds ({:.1f} minutes).'.format(i, elapsed, elapsed / 60))
    all_lsh_candidates.append(candidates)

total_elapsed = time() - t0
print('Done! Took {:.2f} seconds ({:.1f} minutes).'.format(total_elapsed, total_elapsed / 60))

doing lsh
Target #0 done! Took 14.48 seconds (0.2 minutes).
doing lsh
Target #1 done! Took 14.31 seconds (0.2 minutes).
doing lsh
Target #2 done! Took 14.30 seconds (0.2 minutes).
doing lsh
Target #3 done! Took 14.17 seconds (0.2 minutes).
doing lsh
Target #4 done! Took 14.30 seconds (0.2 minutes).
doing lsh
Target #5 done! Took 14.31 seconds (0.2 minutes).
doing lsh
Target #6 done! Took 14.18 seconds (0.2 minutes).
doing lsh
Target #7 done! Took 13.75 seconds (0.2 minutes).
doing lsh
Target #8 done! Took 13.80 seconds (0.2 minutes).
doing lsh
Target #9 done! Took 14.17 seconds (0.2 minutes).
Done! Took 141.76 seconds (2.4 minutes).


In [None]:
from collections import defaultdict
# Same experiment as the previous query loop, but with mode flag 1 passed to
# _lsh.lsh. Results:
#   total_lsh_times_ed     -- wall-clock seconds per target
#   all_lsh_candidates_ed  -- per target, candidate ids sorted by ascending summed distance
t0 = time()
total_lsh_times_ed = []
all_lsh_candidates_ed = []
for i, target in enumerate(targets):
    t1 = time()
    query = data[target]
    print('doing lsh')
    # Last argument 1 -- presumably selects the Euclidean-distance variant
    # (judging by the `_ed` suffix); confirm against the _lsh module.
    lsh_candidates, lsh_distances, _ = _lsh.lsh(data, query, r, a, sd, 1)

    # Sum the distance reported for each candidate over every slot of the
    # 3-level result structure. The innermost index is deliberately NOT named
    # `a`: that would overwrite the LSH parameter `a` from preprocessing and
    # every following query would silently run with a loop index instead.
    distance_sums = defaultdict(int)
    for l in range(len(lsh_candidates)):
        for k in range(len(lsh_candidates[0])):
            for j in range(len(lsh_candidates[0][0])):
                distance_sums[lsh_candidates[l][k][j]] += lsh_distances[l][k][j]

    # Ascending by summed distance; sorted() is stable, so ties keep
    # first-seen order.
    candidates = sorted(distance_sums, key=distance_sums.get)

    elapsed = time() - t1
    total_lsh_times_ed.append(elapsed)
    print('Target #{} done! Took {:.2f} seconds ({:.1f} minutes).'.format(i, elapsed, elapsed / 60))
    all_lsh_candidates_ed.append(candidates)

total_elapsed = time() - t0
print('Done! Took {:.2f} seconds ({:.1f} minutes).'.format(total_elapsed, total_elapsed / 60))

doing lsh
Target #0 done! Took 9.85 seconds (0.2 minutes).
doing lsh
Target #1 done! Took 9.73 seconds (0.2 minutes).
doing lsh
Target #2 done! Took 9.58 seconds (0.2 minutes).
doing lsh
Target #3 done! Took 9.60 seconds (0.2 minutes).
doing lsh
Target #4 done! Took 9.84 seconds (0.2 minutes).
doing lsh
Target #5 done! Took 9.74 seconds (0.2 minutes).
doing lsh
