Commit 068c2d6f authored by Kruyff,D.L.W. (Dylan)'s avatar Kruyff,D.L.W. (Dylan)
Browse files

Using new lsh algorithm (not integrated well)

parent bb598b63
......@@ -17,9 +17,9 @@ export class CacheService {
private _sliderValue;
private _queryWindow;
public windowSize = 200;
public windowSize = 120;
public nrOfTables = 5;
public hashSize = 2;
public hashSize = 5;
public stepSize = 200;
public querySelectionMode = true;
......
......@@ -64,6 +64,15 @@ export class LabelingWindowComponent implements OnInit {
}
}
this.topk = topk;
// const candidates = [1254, 483, 103425, 2589, 7524];
const candidates = [ 80503, 8277, 26256, 118148, 22734, 473, 79996, 78224,
33755, 57522];
;
this.topk = [];
for (const candidate of candidates) {
this.topk.push({index: candidate, frequency: 100});
}
console.log(this.topk);
await this.createPlots();
}
......
......@@ -4,7 +4,7 @@
<mat-tab-group animationDuration="0ms" (selectedTabChange)="changeTab($event)">
<mat-tab label="Training">
<app-labeling-window></app-labeling-window>
<app-table-overview></app-table-overview>
<!-- <app-table-overview></app-table-overview>-->
</mat-tab>
<mat-tab label="Labeled data">
<app-labels></app-labels>
......
......@@ -172,7 +172,7 @@ export class OverviewWindowComponent implements OnInit {
x: clickData.x,
y: clickData.y
});
const index = 80503// Math.floor(xyInformation[0].scalenumvalue / (12000 / 6));
const index = 80503;// Math.floor(xyInformation[0].scalenumvalue / (12000 / 6));
this.service.queryWindow = await this.service.getQueryWindow(index);
const temp = {};
temp[index] = true;
......
......@@ -20,7 +20,11 @@
</component>
<component name="ChangeListManager">
<list default="true" id="556080ba-825c-4b55-a92a-867a4df4fb32" name="Default Changelist" comment="">
<change afterPath="$PROJECT_DIR$/setup.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/../AngularApp/prototype/src/app/cache.service.ts" beforeDir="false" afterPath="$PROJECT_DIR$/../AngularApp/prototype/src/app/cache.service.ts" afterDir="false" />
<change beforePath="$PROJECT_DIR$/../AngularApp/prototype/src/app/labeling-window/labeling-window.component.ts" beforeDir="false" afterPath="$PROJECT_DIR$/../AngularApp/prototype/src/app/labeling-window/labeling-window.component.ts" afterDir="false" />
<change beforePath="$PROJECT_DIR$/../AngularApp/prototype/src/app/main/main.component.html" beforeDir="false" afterPath="$PROJECT_DIR$/../AngularApp/prototype/src/app/main/main.component.html" afterDir="false" />
<change beforePath="$PROJECT_DIR$/../AngularApp/prototype/src/app/overview-window/overview-window.component.ts" beforeDir="false" afterPath="$PROJECT_DIR$/../AngularApp/prototype/src/app/overview-window/overview-window.component.ts" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/main.py" beforeDir="false" afterPath="$PROJECT_DIR$/main.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/processed-data.npy" beforeDir="false" afterPath="$PROJECT_DIR$/processed-data.npy" afterDir="false" />
......@@ -130,10 +134,10 @@
<screen x="72" y="27" width="1848" height="1053" />
</state>
<state x="686" y="355" width="664" height="403" key="#com.intellij.fileTypes.FileTypeChooser/72.27.1848.1053@72.27.1848.1053" timestamp="1601384380854" />
<state x="721" y="422" width="1200" height="800" key="DiffContextDialog" timestamp="1601242420342">
<state x="479" y="254" width="1200" height="800" key="DiffContextDialog" timestamp="1601665758968">
<screen x="72" y="27" width="1848" height="1053" />
</state>
<state x="721" y="422" width="1200" height="800" key="DiffContextDialog/72.27.1848.1053@72.27.1848.1053" timestamp="1601242420342" />
<state x="479" y="254" width="1200" height="800" key="DiffContextDialog/72.27.1848.1053@72.27.1848.1053" timestamp="1601665758968" />
<state x="779" y="311" width="424" height="491" key="FileChooserDialogImpl" timestamp="1601285087193">
<screen x="72" y="27" width="1848" height="1053" />
</state>
......
......@@ -13,9 +13,16 @@ import bigwig
import bbi
from bitarray import bitarray
import _ucrdtw
import _lsh
from scipy.spatial import distance
from scipy.sparse import dia_matrix
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
import dtw
import math
from random import sample
reload = True
reload = False
app = Flask(__name__)
CORS(app)
......@@ -33,7 +40,37 @@ def calculate_signatures_cumsum_weights(data, window_size=None, hash_size=None,
if hash_function is None:
hash_function = np.array([np.cumsum(np.random.uniform(-1, 1, window_size)) for _ in range(hash_size)]).transpose()
signatures_bool = np.dot(data, hash_function) > 0
signatures_int = np.packbits(signatures_bool)
if hash_size is None:
signatures_int = np.packbits(signatures_bool)
else:
signatures_int = np.packbits(signatures_bool, axis=1).flatten()
return signatures_int.tolist(), hash_function
def calculate_signatures_normal_weights(data, window_size=None, hash_size=None, hash_function=None):
    """Compute LSH signatures via random-hyperplane hashing with N(0, 1) weights.

    Each window (row of *data*) is projected onto `hash_size` random
    hyperplanes; the sign pattern of the projections is packed into bytes.

    data          -- 2-D array, one window per row.
    window_size   -- length of a window; only used when a new hash function
                     is drawn (i.e. hash_function is None).
    hash_size     -- number of hyperplanes; when None, all sign bits are
                     packed into one flat byte stream instead of per-row.
    hash_function -- optional (window_size, hash_size) projection matrix to
                     reuse; returned unchanged when supplied.

    Returns (signatures, hash_function) where signatures is a plain list of
    packed byte values.
    """
    if hash_function is None:
        # Draw hash_size hyperplanes with standard-normal weights, one per column.
        hash_function = np.random.normal(0, 1, (hash_size, window_size)).transpose()
    bits = np.dot(data, hash_function) > 0
    if hash_size is None:
        packed = np.packbits(bits)
    else:
        # One signature byte stream per window: pack each row separately.
        packed = np.packbits(bits, axis=1).flatten()
    return packed.tolist(), hash_function
def calculate_signatures_normal_split_weights(data, window_size=None, hash_size=None, hash_function=None):
    """Compute LSH signatures with hyperplanes that each cover a disjoint segment.

    Unlike the dense-normal variant, hyperplane i has N(0, 1) weights only on
    the i-th `window_size / hash_size` slice of the window and zeros elsewhere,
    so each bit of the signature reacts to one segment of the window.

    data          -- 2-D array, one window per row.
    window_size   -- window length; used only when drawing a new hash function.
    hash_size     -- number of segments/hyperplanes; when None, all sign bits
                     are packed into one flat byte stream.
    hash_function -- optional precomputed projection matrix to reuse.

    Returns (signatures, hash_function) with signatures as a plain list.
    """
    if hash_function is None:
        segment = int(window_size / hash_size)
        planes = np.zeros((hash_size, window_size))
        for row in range(hash_size):
            start = row * segment
            # Fill only this hyperplane's own segment with random weights.
            planes[row, start:start + segment] = np.random.normal(0, 1, segment)
        hash_function = planes.transpose()
    bits = np.dot(data, hash_function) > 0
    packed = np.packbits(bits) if hash_size is None else np.packbits(bits, axis=1).flatten()
    return packed.tolist(), hash_function
def calculate_signatures_new(data, window_size=None, hash_size=None, hash_function=None):
......@@ -67,7 +104,7 @@ def calculate_signatures_new(data, window_size=None, hash_size=None, hash_functi
return signatures_int.tolist(), hash_function
lsh_function = calculate_signatures_new
lsh_function = calculate_signatures_normal_weights
@app.route('/', methods=['GET'])
def index():
......@@ -86,24 +123,40 @@ def read_data():
}
response = orjson.dumps(response)
print('Data read: ' + str(time()-t0))
# query = data[12000:24000]
# loc, dist = _ucrdtw.ucrdtw(data, query, 0.05, True)
# print(data[loc:loc+120])
# print('found query: ' + str(loc) + '[' + str(time()-t0) + ']')
return response
@app.route('/create-windows', methods=['POST'])
def create_windows():
t0 = time()
if reload:
raw_data = request.json
window_size = int(raw_data['parameters']["windowsize"])
# raw_data = request.json
# window_size = int(raw_data['parameters']["windowsize"])
window_size = 120
chromsize = bbi.chromsizes('test.bigWig')['chr1']
step_size = chromsize / 10000
data = bigwig.get('test.bigWig', 'chr1', 0, chromsize, 20000000)
data = (data - np.min(data))/np.ptp(data)
step_size = int(12000 / 6)
start_bps = np.arange(0, chromsize - 12000 + step_size, step_size)
end_bps = np.arange(12000, chromsize + step_size, step_size)
data = bigwig.chunk(
'test.bigWig',
12000,
int(12000 / window_size),
int(12000 / 6),
['chr1'],
verbose=True,
)
# data = bbi.stackup(
# 'test.bigWig',
# ['chr1'] * start_bps.size,
# start_bps,
# end_bps,
# bins=window_size,
# missing=0.0,
# oob=0.0,
# )
# data = (data - np.min(data))/np.ptp(data)
print(data.shape)
np.save('processed-data', data)
np.savetxt('processed-data', data, delimiter=' ', fmt='%f')
print('Windows created: ' + str(time()-t0))
return '1'
......@@ -116,6 +169,8 @@ def create_tables():
table_size = int(raw_data['parameters']["tablesize"])
t0 = time()
r, a, sd = preprocess()
lsh_method(r, a, sd)
hash_functions, tables = lsh(data, window_size, hash_size, table_size)
response = {}
......@@ -136,7 +191,7 @@ def lsh(data, window_size, hash_size, table_size):
for index in range(table_size):
signatures, hash_function = lsh_function(data, window_size=window_size, hash_size=hash_size)
print('creating dictionary')
print(index)
table = defaultdict(list)
for v, k in enumerate(signatures):
table[k].append(v)
......@@ -155,10 +210,12 @@ def similarity():
tables = raw_data["tables"]
neighbours = []
output = defaultdict(list)
i = 0
for t in tables.values():
signature = lsh_function(window, hash_function=t["hash"])
neighbours.extend(t["entries"][str(signature)])
print(i)
signatures, _ = lsh_function(window, hash_function=t["hash"])
neighbours.extend(t["entries"][str(signatures[0])])
i = i+1
neighbours_with_frequency = dict(Counter(neighbours))
for index, frequency in neighbours_with_frequency.items():
output[str(frequency)].append(index)
......@@ -185,8 +242,8 @@ def update():
for t in tables.values():
valid = True
signature = lsh_function(window, hash_function=t['hash'])
neighbours = t["entries"][signature]
signatures, _ = lsh_function(window, hash_function=t['hash'])
neighbours = t["entries"][str(signatures[0])]
for index in correct_indices:
if index not in neighbours:
valid = False
......@@ -231,12 +288,13 @@ def query():
raw_data = orjson.loads(request.data)
window = raw_data['window']
if isinstance(window, int):
output = np.load('processed-data.npy')[window:window+12000]
output = np.load('processed-data.npy')[window]
response = orjson.dumps(output.tolist())
print("Query done: " + str(time() - t0))
return response
else :
output = preprocessing.minmax_scale(window, (-1, 1))
else:
print("OOOOOOOOOOOOOOOO")
output = (window - np.min(window))/np.ptp(window)
response = orjson.dumps(output.tolist())
print("Query done: " + str(time()-t0))
return response
......@@ -307,4 +365,220 @@ def average_table():
print("Average calculated: " + str(time() - t1))
response = orjson.dumps(output)
print("Averages calculated: " + str(time() - t0))
return response
\ No newline at end of file
return response
def preprocess(data=None):
    """Estimate an LSH search radius from pairwise distances of a data sample.

    The windows are first thinned greedily to a subset in which every pair is
    at least `min_gap` apart (Euclidean), all ordered pairwise Euclidean
    distances within that subset are measured, and a radius is derived from
    their mean and standard deviation.

    Parameters
    ----------
    data : ndarray, optional
        2-D array of windows, one per row.  When omitted, the windows are
        loaded from 'processed-data.npy' (backward-compatible default).

    Returns
    -------
    tuple
        (r, a, sd) where r = (mean - 2.58 * sd) / sqrt(120) over the
        subset's pairwise Euclidean distances, and a and sd are fixed at 1
        because the DTW/ED ratio estimation is currently disabled.
    """
    if data is None:
        data = np.load('processed-data.npy')
    t0 = time()
    # Greedy epsilon-net: keep a window only if it is at least min_gap away
    # from every window already kept.  O(n * |subset|) but runs once.
    min_gap = 3
    subset = []
    for i, window in enumerate(data):
        if i % 10000 == 0:
            print(str(i) + ':' + str(len(subset)))  # progress: windows seen / kept
        if all(np.linalg.norm(window - data[s]) >= min_gap for s in subset):
            subset.append(i)
    # All ordered pairwise Euclidean distances within the subset (i != j),
    # so each unordered pair is counted twice — this does not change the mean.
    eq_distances = []
    for i, index_1 in enumerate(subset):
        print(i)  # progress through the subset
        for index_2 in subset:
            if index_1 == index_2:
                continue
            eq_distances.append(distance.euclidean(data[index_1], data[index_2]))
    # NOTE(review): if the subset has fewer than 2 windows, eq_distances is
    # empty and mean/std become NaN — confirm inputs are large enough.
    mean_eq = np.mean(eq_distances)
    sd_eq = np.std(eq_distances)
    a = 1   # placeholder: mean DTW/ED ratio (ratio estimation disabled)
    sd = 1  # placeholder: std of the DTW/ED ratio
    # 2.58 standard deviations below the mean ~ 0.5th percentile of distances.
    theta = mean_eq + -2.58 * sd_eq
    # sqrt(120) presumably normalises by window length 120 — TODO confirm.
    r = theta / (math.sqrt(120))
    print(a)
    print(sd)
    print(theta)
    print(r)
    print(time() - t0)
    return r, a, sd
def dtw_query():
    """Benchmark: DTW-align window 80503 against every window of the dataset.

    Loads 'processed-data.npy', reshapes it to (n, 1, window_len) as the dtw
    package expects, and prints the elapsed time; alignments are discarded.
    """
    windows = np.load('processed-data.npy')
    windows = np.array(windows, dtype='double')
    windows = np.repeat(windows, repeats=1, axis=0)  # repeats=1: no-op, kept from original
    windows = np.reshape(windows, (int(len(windows) / 1), 1, len(windows[0])))
    query = windows[80503]  # hard-coded query window index
    started = time()
    for idx, candidate in enumerate(windows):
        print(idx)  # progress indicator
        alignment = dtw.dtw(query, candidate, keep_internals=True)
    print(time() - started)
def lsh_method(r, a, sd):
    """Run the compiled `_lsh` search for a fixed query window and report recall.

    Rebuilds the window file, queries the C++ `_lsh` extension with the given
    radius/ratio parameters, then prints how many of the top-20 and top-50
    Euclidean-distance neighbours appear at the same cutoff in the returned
    candidate list.

    r  -- search radius estimate (see ``preprocess``).
    a  -- mean DTW/ED ratio (currently fixed at 1 by the caller).
    sd -- standard deviation of the DTW/ED ratio (currently fixed at 1).
    """
    create_windows()
    query_n = 80503  # hard-coded query window index
    dim = 10  # NOTE(review): unused here — presumably a leftover; confirm
    data = np.load('processed-data.npy')
    data= np.array(data, dtype='double')
    # Shape (n_windows, window_len, 1); repeats=1 makes the repeat a no-op,
    # presumably left adjustable for a multi-dimensional variant — confirm.
    data = np.reshape(data, (len(data), len(data[0]), 1))
    data = np.repeat(data, repeats=1, axis=2)
    query = data[query_n]
    candidates, hf = _lsh.lsh(data, query, r, a, sd)
    print(repr(candidates[0:10]))
    # DTW-based ground truth disabled; Euclidean distance is used instead.
    # data = np.load('processed-data.npy')
    # query = data[query_n]
    # distances = [_ucrdtw.ucrdtw(window, query, 0.05, False)[1] for window in data]
    # topk_dtw = sorted(range(len(distances)), key=lambda k: distances[k])
    # print(topk_dtw[0:10])
    # Ground truth: all windows ranked by Euclidean distance to the query.
    distances_ed = [distance.euclidean(query, window) for window in data]
    topk_ed = sorted(range(len(distances_ed)), key=lambda k: distances_ed[k])
    accuracy = 0
    # for index in topk_dtw[0:50]:
    #     if index in candidates[0:50]:
    #         accuracy += 1
    # print(accuracy)
    # Recall of the first 20 LSH candidates against the top-20 true neighbours.
    accuracy = 0
    for index in topk_ed[0:20]:
        if index in candidates[0:20]:
            accuracy += 1
    print(accuracy)
    # Recall of the first 50 LSH candidates against the top-50 true neighbours.
    accuracy = 0
    for index in topk_ed[0:50]:
        if index in candidates[0:50]:
            accuracy += 1
    print(accuracy)
# accuracy = 0
# for index in topk_dtw[0:50]:
# if index in candidates[0:1000]:
# accuracy += 1
# print(accuracy)
#
# accuracy = 0
# for index in topk_dtw[0:50]:
# if index in candidates[0:5000]:
# accuracy += 1
# print(accuracy)
#
# accuracy = 0
# for index in topk_dtw[0:50]:
# if index in candidates[0:10000]:
# accuracy += 1
# print(accuracy)
#
# accuracy = 0
# for index in topk_dtw[0:50]:
# if index in candidates[0:50000]:
# accuracy += 1
# print(accuracy)
#
# accuracy = 0
# for index in topk_dtw[0:50]:
# if index in candidates:
# accuracy += 1
# print(accuracy)
# r, a, sd = preprocess()
# lsh_method(r, a, sd)
# create_windows()
# query_n = 80503
# data = np.load('processed-data.npy')
# data= np.array(data, dtype='double')
# data = np.reshape(data, (len(data), len(data[0]), 1))
# data = np.repeat(data, repeats=10, axis=2)
# query = data[query_n]
# # candidates, hf = _lsh.lsh(data, query)
# # data = np.load('processed-data.npy')
# # query = data[query_n]
#
# data = np.load('processed-data.npy')
# print(_ucrdtw.ucrdtw(data[query_n], data[0], 0.05, False)[1])
#
# # l2_norm = lambda x, y: (x - y) ** 2
#
# data = np.load('processed-data.npy')
# data= np.array(data, dtype='double')
# data = np.repeat(data, repeats=1, axis=0)
# data = np.reshape(data, (int(len(data)/1), 1, len(data[0])))
# query = data[query_n]
# # distances = [_ucrdtw.ucrdtw(window, query, 0.05, False)[1] for window in data]
# # topk_dtw = sorted(range(len(distances)), key=lambda k: distances[k])
# # print(topk_dtw[0:10])
#
# # Generate our data
# template = data[query_n]
# rt,ct = template.shape
# rq,cq = query.shape
# t0 = time()
# # Calculate the alignment vector and corresponding distance
# alignment = dtw.dtw(query, template, keep_internals=True)
# print(alignment.distance)
#
# print(time()-t0)
# np.save('topk', np.array(topk_dtw))
print('done')
# topk_dtw = np.load('topk.npy')
# distances_ed = [distance.euclidean(query, window) for window in data]
# topk_ed = sorted(range(len(distances_ed)), key=lambda k: distances_ed[k])
#
#
# accuracy = 0
# for index in topk_dtw[0:50]:
# if index in candidates[0:50]:
# accuracy += 1
# print(accuracy)
# accuracy = 0
# output = []
# for index in topk_ed[0:50]:
# if index in candidates:
# accuracy += 1
# print(accuracy)
# accuracy = 0
# for index in topk_ed[0:50]:
# if index in candidates[0:50]:
# accuracy += 1
# print(accuracy)
# accuracy = 0
# for index in topk_ed[0:20]:
# if index in candidates[0:20]:
# accuracy += 1
# print(accuracy)
# Build script for the `_lsh` C++ extension module used by the Flask backend.
# Build in place with:  python setup.py build_ext --inplace
# NOTE(review): distutils is removed in Python 3.12 and numpy.distutils is
# deprecated — consider migrating to setuptools; verify against the project's
# target Python/NumPy versions.
from distutils.core import setup, Extension
import numpy.distutils.misc_util
# Sources live in the sibling lsh-fast directory (relative to this file).
c_ext = Extension('_lsh', ['../../lsh-fast/_lsh.cpp', '../../lsh-fast/lsh.cpp'])
setup(
    name='lsh',
    ext_modules=[c_ext],
    # NumPy headers are required to compile the extension.
    include_dirs=numpy.distutils.misc_util.get_numpy_include_dirs(),
)
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment