Commit d746d449 authored by Kruyff, D.L.W. (Dylan)

Extracted lsh method from table creation
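For orientation, here is a minimal usage sketch of the helpers this commit extracts, based on the signatures that appear in the diff below (`lsh(data, window_size, hash_size, table_size)` and `calculate_signatures_random_weights(...)`). The data shape and parameter values are illustrative assumptions, and the two functions are assumed to be importable from the changed module:

```python
import numpy as np

# Assumed to be in scope / importable from the changed module (see diff below):
# from main import lsh, calculate_signatures_random_weights

# Toy input: 1000 windows of length 120 (shapes and parameters are assumptions).
data = np.random.rand(1000, 120)
window_size, hash_size, table_size = 120, 8, 10

# Build `table_size` LSH tables; each table maps a bit-string signature to the
# indices of the windows that fall into that bucket, and each table keeps the
# random hash function (projection matrix) that produced its signatures.
hash_functions, tables = lsh(data, window_size, hash_size, table_size)

# Querying, as the /similarity endpoint does: re-hash the query window with a
# table's stored hash function and collect the indices in the matching bucket.
query = data[0]
for hash_function, table in zip(hash_functions, tables):
    signature = calculate_signatures_random_weights(query, hash_function=hash_function)
    candidates = table[signature]  # window indices colliding with the query
```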


Former-commit-id: 95e90860
parent 1270bdb5
@@ -20,7 +20,6 @@
</component>
<component name="ChangeListManager">
<list default="true" id="556080ba-825c-4b55-a92a-867a4df4fb32" name="Default Changelist" comment="">
<change beforePath="$PROJECT_DIR$/../AngularApp/prototype/src/app/cache.service.ts" beforeDir="false" afterPath="$PROJECT_DIR$/../AngularApp/prototype/src/app/cache.service.ts" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/main.py" beforeDir="false" afterPath="$PROJECT_DIR$/main.py" afterDir="false" />
</list>
@@ -57,36 +57,130 @@ def create_windows():
@app.route('/create-tables', methods=['POST'])
def create_tables():
t0 = time()
data = np.load('processed-data.npy')
raw_data = orjson.loads(request.data)
window_size = int(raw_data['parameters']["windowsize"])
hash_size = int(raw_data['parameters']["hashsize"])
table_size = int(raw_data['parameters']["tablesize"])
print('Starting: ' + str(time()-t0))
tables_hash_function = [np.random.uniform(-100, 100, size=(window_size, hash_size)) for _ in range(table_size)]
hash_functions, tables = lsh(data, window_size, hash_size, table_size)
response = {}
for table_index in range(table_size):
response[str(table_index)] = {
"hash": hash_functions[table_index],
"entries": tables[table_index]
}
response = orjson.dumps(response)
return response
def lsh(data, window_size, hash_size, table_size):
t0 = time()
print('Starting: ' + str(time() - t0))
tables_hash_function = []
print('Init time: ' + str(time() - t0))
tables = []
for index in range(table_size):
t1 = time()
table = defaultdict(list)
signatures_bool = np.dot(data, tables_hash_function[index]) > 0
signatures = [''.join(['1' if x else '0' for x in lst]) for lst in signatures_bool]
signatures, hash_function = calculate_signatures_random_weights(data, window_size=window_size, hash_size=hash_size)
for i in range(len(signatures)):
table[signatures[i]].append(i)
print(time()-t1)
tables.append(table)
tables_hash_function.append(hash_function.tolist())
print(time() - t1)
print('Creation time: ' + str(time() - t0))
hash_functions = np.array(tables_hash_function).tolist()
hash_functions = tables_hash_function
return hash_functions, tables
def calculate_signatures_random_weights(data, window_size=None, hash_size=None, hash_function=None):
if hash_function is None:
hash_function = np.random.uniform(-100, 100, size=(window_size, hash_size))
signatures_bool = np.dot(data, hash_function) > 0
if signatures_bool.ndim == 1:
return ''.join(['1' if x else '0' for x in signatures_bool])
return [''.join(['1' if x else '0' for x in lst]) for lst in signatures_bool], hash_function
@app.route('/similarity', methods=['POST'])
def similarity():
t0 = time()
raw_data = orjson.loads(request.data)
window = raw_data['query']
tables = raw_data["tables"]
neighbours = []
output = defaultdict(list)
for t in tables.values():
signature = calculate_signatures_random_weights(window, hash_function=t["hash"])
neighbours.extend(t["entries"][signature])
neighbours_with_frequency = dict(Counter(neighbours))
for index, frequency in neighbours_with_frequency.items():
output[str(frequency)].append(index)
response = orjson.dumps(output)
print("Similarity done: " + str(time()-t0))
return response
@app.route('/update', methods=['POST'])
def update():
t0 = time()
raw_data = orjson.loads(request.data)
data = np.load('processed-data.npy')
label_data = raw_data["labelData"]
tables = raw_data["tables"]
window = raw_data["query"]
window_size = int(raw_data['parameters']["windowsize"])
hash_size = int(raw_data['parameters']["hashsize"])
table_size = int(raw_data['parameters']["tablesize"])
new_tables = []
correct_indices = [int(index) for index, value in label_data.items() if value is True]
incorrect_indices = [int(index) for index, value in label_data.items() if value is False]
for t in tables.values():
valid = True
signature = calculate_signatures_random_weights(window, hash_function=t['hash'])
neighbours = t["entries"][signature]
for index in correct_indices:
if index not in neighbours:
valid = False
break
for index in incorrect_indices:
if index in neighbours:
valid = False
break
if valid:
new_tables.append(t)
for index in range(table_size - len(new_tables)):
entries = defaultdict(list)
t1 = time()
while True:
correct_signatures, hash_function = calculate_signatures_random_weights(data[correct_indices], window_size=window_size, hash_size=hash_size)
incorrect_signatures, _ = calculate_signatures_random_weights(data[incorrect_indices], hash_function=hash_function)
if correct_signatures.count(correct_signatures[0]) == len(correct_signatures) and incorrect_signatures.count(correct_signatures[0]) == 0:
break
signatures, _ = calculate_signatures_random_weights(data, hash_function=hash_function)
for i in range(len(signatures)):
entries[signatures[i]].append(i)
print(str(index) + ": " + str(time() - t1))
new_tables.append({
"hash": hash_function.tolist(),
"entries": entries
})
print('Update time: ' + str(time() - t0))
response = {}
for table_index in range(table_size):
response[str(table_index)] = {
"hash": hash_functions[table_index],
"entries": tables[table_index]
for table_index in range(len(new_tables)):
response[table_index] = {
"hash": new_tables[table_index]["hash"],
"entries": new_tables[table_index]["entries"]
}
response = orjson.dumps(response)
response = jsonify(response)
return response
@app.route('/query', methods=['POST'])
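The `/update` route in the hunk above rebuilds rejected tables by rejection sampling: it keeps drawing a fresh random hash function until every window labelled correct shares the query's bucket and no window labelled incorrect lands in it. A self-contained sketch of that acceptance test, with toy data and parameter values that are assumptions for illustration only:

```python
import numpy as np

rng = np.random.default_rng(1)

def signatures(data, hash_function):
    # One bit-string per row: the sign pattern of the random projection.
    sigs = np.dot(data, hash_function) > 0
    return [''.join('1' if b else '0' for b in row) for row in sigs]

def acceptable(hash_function, data, correct_indices, incorrect_indices):
    sigs = signatures(data, hash_function)
    correct = [sigs[i] for i in correct_indices]
    incorrect = [sigs[i] for i in incorrect_indices]
    # Accept only if all windows labelled correct share one bucket and no
    # window labelled incorrect falls into that same bucket.
    return len(set(correct)) == 1 and correct[0] not in incorrect

# Toy setup: 200 windows of length 120; windows 0 and 1 are near-duplicates
# ("correct"), window 2 is unrelated ("incorrect").
window_size, hash_size = 120, 8
data = rng.random((200, window_size))
data[1] = data[0] + rng.normal(scale=0.01, size=window_size)
correct_indices, incorrect_indices = [0, 1], [2]

# Rejection sampling, mirroring the `while True:` loop in /update.
while True:
    hash_function = rng.uniform(-100, 100, size=(window_size, hash_size))
    if acceptable(hash_function, data, correct_indices, incorrect_indices):
        break
```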
@@ -115,27 +209,6 @@ def window():
print("Query done: " + str(time() - t0))
return response
@app.route('/similarity', methods=['POST'])
def similarity():
t0 = time()
raw_data = orjson.loads(request.data)
window = raw_data['query']
tables = raw_data["tables"]
neighbours = []
output = defaultdict(list)
for t in tables.values():
signature_bool = np.dot(window, t["hash"]) > 0
signature = ''.join(['1' if x else '0' for x in signature_bool])
neighbours.extend(t["entries"][signature])
neighbours_with_frequency = dict(Counter(neighbours))
for index, frequency in neighbours_with_frequency.items():
output[str(frequency)].append(index)
response = orjson.dumps(output)
print("Similarity done: " + str(time()-t0))
return response
@app.route('/average-progress', methods=['POST'])
def average_progress():
t0 = time()
@@ -192,76 +265,4 @@ def average_table():
print("Average calculated: " + str(time() - t1))
response = orjson.dumps(output)
print("Averages calculated: " + str(time() - t0))
return response
@app.route('/update', methods=['POST'])
def update():
t0 = time()
print("Start")
raw_data = orjson.loads(request.data)
print("Data loaded: " + str(time() - t0))
data = np.load('processed-data.npy')
label_data = raw_data["labelData"]
tables = raw_data["tables"]
window = raw_data["query"]
window_size = int(raw_data['parameters']["windowsize"])
hash_size = int(raw_data['parameters']["hashsize"])
table_size = int(raw_data['parameters']["tablesize"])
new_tables = []
correct_indices = [int(index) for index, value in label_data.items() if value is True]
incorrect_indices = [int(index) for index, value in label_data.items() if value is False]
print("Initialized: " + str(time() - t0))
for t in tables.values():
valid = True
signature = ''.join((np.dot(window, t["hash"]) > 0).astype('int').astype('str'))
neighbours = t["entries"][signature]
for index in correct_indices:
if index not in neighbours:
valid = False
break
for index in incorrect_indices:
if index in neighbours:
valid = False
break
if valid:
new_tables.append(t)
print("Filtered good tables: " + str(time() - t0))
for index in range(table_size - len(new_tables)):
entries = defaultdict(list)
t1 = time()
while True:
hash_function = np.random.randn(window_size, hash_size)
correct_signatures = [''.join((np.dot(data[i], hash_function) > 0).astype('int').astype('str')) for i in correct_indices]
incorrect_signatures = [''.join((np.dot(data[i], hash_function) > 0).astype('int').astype('str')) for i in incorrect_indices]
if correct_signatures.count(correct_signatures[0]) == len(correct_signatures) and incorrect_signatures.count(correct_signatures[0]) == 0:
break
print("first: " + str(time() - t1))
t2 = time()
signatures_bool = np.dot(data, hash_function) > 0
signatures = [''.join(['1' if x else '0' for x in lst]) for lst in signatures_bool]
for i in range(len(signatures)):
entries[signatures[i]].append(i)
print("second: " + str(time() - t2))
new_tables.append({
"hash": hash_function.tolist(),
"entries": entries
})
print('Update time: ' + str(time() - t0))
response = {}
for table_index in range(len(new_tables)):
response[table_index] = {
"hash": new_tables[table_index]["hash"],
"entries": new_tables[table_index]["entries"]
}
response = jsonify(response)
return response
\ No newline at end of file
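For completeness, the signature scheme used on both sides of this diff is random-hyperplane LSH: a window's signature is the sign pattern of its projection onto `hash_size` random directions, so similar windows tend to share a bucket while dissimilar ones rarely do. A standalone illustration; every name and value below is an assumption for demonstration, not part of the commit:

```python
import numpy as np

rng = np.random.default_rng(0)
window_size, hash_size = 120, 8

# One random projection matrix plays the role of one table's hash function.
hash_function = rng.uniform(-100, 100, size=(window_size, hash_size))

def signature(window):
    # Bit string of projection signs; nearby windows flip few (often zero) bits.
    return ''.join('1' if x else '0' for x in window @ hash_function > 0)

base = rng.random(window_size)
near = base + rng.normal(scale=0.01, size=window_size)  # small perturbation
far = rng.random(window_size)                           # unrelated window

print(signature(base))
print(signature(near))  # usually identical, or differs in a bit or two
print(signature(far))   # typically differs in several of the 8 bits
```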