Next Tuesday, 25 January, around 21:30 we'll be upgrading to GitLab version 14.7.

Commit bb598b63 authored by Kruyff,D.L.W. (Dylan)
Browse files

Trying simple hashing with low memory use (not working)

parent 6375d5da
...@@ -17,9 +17,9 @@ export class CacheService { ...@@ -17,9 +17,9 @@ export class CacheService {
private _sliderValue; private _sliderValue;
private _queryWindow; private _queryWindow;
public windowSize = 120; public windowSize = 200;
public nrOfTables = 20; public nrOfTables = 5;
public hashSize = 8; public hashSize = 2;
public stepSize = 200; public stepSize = 200;
public querySelectionMode = true; public querySelectionMode = true;
......
...@@ -126,13 +126,17 @@ ...@@ -126,13 +126,17 @@
<option name="oldMeFiltersMigrated" value="true" /> <option name="oldMeFiltersMigrated" value="true" />
</component> </component>
<component name="WindowStateProjectService"> <component name="WindowStateProjectService">
<state x="686" y="355" width="610" height="403" key="#com.intellij.fileTypes.FileTypeChooser" timestamp="1600727680781"> <state x="686" y="355" width="664" height="403" key="#com.intellij.fileTypes.FileTypeChooser" timestamp="1601384380854">
<screen x="72" y="27" width="1848" height="1053" /> <screen x="72" y="27" width="1848" height="1053" />
</state> </state>
<state x="686" y="355" width="610" height="403" key="#com.intellij.fileTypes.FileTypeChooser/72.27.1848.1053@72.27.1848.1053" timestamp="1600727680781" /> <state x="686" y="355" width="664" height="403" key="#com.intellij.fileTypes.FileTypeChooser/72.27.1848.1053@72.27.1848.1053" timestamp="1601384380854" />
<state x="779" y="311" width="424" height="491" key="FileChooserDialogImpl" timestamp="1600726193087"> <state x="721" y="422" width="1200" height="800" key="DiffContextDialog" timestamp="1601242420342">
<screen x="72" y="27" width="1848" height="1053" /> <screen x="72" y="27" width="1848" height="1053" />
</state> </state>
<state x="779" y="311" width="424" height="491" key="FileChooserDialogImpl/72.27.1848.1053@72.27.1848.1053" timestamp="1600726193087" /> <state x="721" y="422" width="1200" height="800" key="DiffContextDialog/72.27.1848.1053@72.27.1848.1053" timestamp="1601242420342" />
<state x="779" y="311" width="424" height="491" key="FileChooserDialogImpl" timestamp="1601285087193">
<screen x="72" y="27" width="1848" height="1053" />
</state>
<state x="779" y="311" width="424" height="491" key="FileChooserDialogImpl/72.27.1848.1053@72.27.1848.1053" timestamp="1601285087193" />
</component> </component>
</project> </project>
\ No newline at end of file
...@@ -13,8 +13,9 @@ import bigwig ...@@ -13,8 +13,9 @@ import bigwig
import bbi import bbi
from bitarray import bitarray from bitarray import bitarray
import _ucrdtw import _ucrdtw
from scipy.sparse import dia_matrix
reload = False reload = True
app = Flask(__name__) app = Flask(__name__)
CORS(app) CORS(app)
...@@ -35,14 +36,38 @@ def calculate_signatures_cumsum_weights(data, window_size=None, hash_size=None, ...@@ -35,14 +36,38 @@ def calculate_signatures_cumsum_weights(data, window_size=None, hash_size=None,
signatures_int = np.packbits(signatures_bool) signatures_int = np.packbits(signatures_bool)
return signatures_int.tolist(), hash_function return signatures_int.tolist(), hash_function
def calculate_signatures_cumsum_weights(data, window_size=None, hash_size=None, hash_function=None): def calculate_signatures_new(data, window_size=None, hash_size=None, hash_function=None):
if hash_function is None: if hash_function is None:
hash_function = np.array([np.cumsum(np.random.uniform(-1, 1, window_size)) for _ in range(hash_size)]).transpose() hash_function = np.array([np.cumsum(np.random.uniform(-1, 1, window_size)) for _ in range(hash_size)]).transpose()
signatures_bool = np.dot(data, hash_function) > 0 if len(data) == len(np.array(hash_function)[:, 0]):
signatures_int = np.packbits(signatures_bool) signatures_bool = np.dot(data, hash_function) > 0
output = signatures_bool.astype(int)[0]
print(output)
return output
print('starting hashing')
t0 = time()
all_signatures = []
batch_size = 20
data = data.transpose()
temp = np.zeros((batch_size, window_size + batch_size - 1))
for h in range(hash_size):
for i in range(batch_size):
temp[i, i:i + window_size] = hash_function[:, h]
print('first: ' + str(time() - t0))
signatures_bool = [np.dot(temp, data[i:i + window_size + batch_size - 1]) > 0 for i in range(0, len(data) - window_size, batch_size)]
# signatures_bool = []
# for i in range(0, len(data) - window_size, batch_size):
# if i % 1000000 == 0:
# print(i)
# signatures_bool.append(np.dot(temp, data[i:i + window_size + batch_size - 1]) > 0)
print('second: ' + str(time() - t0))
all_signatures.append(np.array(signatures_bool).flatten().astype(int))
print('done')
signatures_int = np.packbits(np.stack(np.array(all_signatures), axis=1), axis=0).flatten()
return signatures_int.tolist(), hash_function return signatures_int.tolist(), hash_function
lsh_function = calculate_signatures_cumsum_weights
lsh_function = calculate_signatures_new
@app.route('/', methods=['GET']) @app.route('/', methods=['GET'])
def index(): def index():
...@@ -61,11 +86,10 @@ def read_data(): ...@@ -61,11 +86,10 @@ def read_data():
} }
response = orjson.dumps(response) response = orjson.dumps(response)
print('Data read: ' + str(time()-t0)) print('Data read: ' + str(time()-t0))
query = data[10000:11200] # query = data[12000:24000]
print(query) # loc, dist = _ucrdtw.ucrdtw(data, query, 0.05, True)
loc, dist = _ucrdtw.ucrdtw(data, query, 0.05, True) # print(data[loc:loc+120])
print(data[loc:loc+120]) # print('found query: ' + str(loc) + '[' + str(time()-t0) + ']')
print('found query: ' + str(loc) + '[' + str(time()-t0) + ']')
return response return response
@app.route('/create-windows', methods=['POST']) @app.route('/create-windows', methods=['POST'])
...@@ -74,14 +98,10 @@ def create_windows(): ...@@ -74,14 +98,10 @@ def create_windows():
if reload: if reload:
raw_data = request.json raw_data = request.json
window_size = int(raw_data['parameters']["windowsize"]) window_size = int(raw_data['parameters']["windowsize"])
data = bigwig.chunk( chromsize = bbi.chromsizes('test.bigWig')['chr1']
'test.bigWig', step_size = chromsize / 10000
12000, data = bigwig.get('test.bigWig', 'chr1', 0, chromsize, 20000000)
int(12000 / window_size), data = (data - np.min(data))/np.ptp(data)
int(12000 / 6),
['chr1'],
verbose=True,
)
print(data.shape) print(data.shape)
np.save('processed-data', data) np.save('processed-data', data)
print('Windows created: ' + str(time()-t0)) print('Windows created: ' + str(time()-t0))
...@@ -116,7 +136,10 @@ def lsh(data, window_size, hash_size, table_size): ...@@ -116,7 +136,10 @@ def lsh(data, window_size, hash_size, table_size):
for index in range(table_size): for index in range(table_size):
signatures, hash_function = lsh_function(data, window_size=window_size, hash_size=hash_size) signatures, hash_function = lsh_function(data, window_size=window_size, hash_size=hash_size)
table = {k: v for v, k in enumerate(signatures)} print('creating dictionary')
table = defaultdict(list)
for v, k in enumerate(signatures):
table[k].append(v)
tables.append(table) tables.append(table)
tables_hash_function.append(hash_function.tolist()) tables_hash_function.append(hash_function.tolist())
...@@ -135,7 +158,7 @@ def similarity(): ...@@ -135,7 +158,7 @@ def similarity():
for t in tables.values(): for t in tables.values():
signature = lsh_function(window, hash_function=t["hash"]) signature = lsh_function(window, hash_function=t["hash"])
neighbours.extend(t["entries"][signature]) neighbours.extend(t["entries"][str(signature)])
neighbours_with_frequency = dict(Counter(neighbours)) neighbours_with_frequency = dict(Counter(neighbours))
for index, frequency in neighbours_with_frequency.items(): for index, frequency in neighbours_with_frequency.items():
output[str(frequency)].append(index) output[str(frequency)].append(index)
...@@ -208,7 +231,7 @@ def query(): ...@@ -208,7 +231,7 @@ def query():
raw_data = orjson.loads(request.data) raw_data = orjson.loads(request.data)
window = raw_data['window'] window = raw_data['window']
if isinstance(window, int): if isinstance(window, int):
output = np.load('processed-data.npy')[window] output = np.load('processed-data.npy')[window:window+12000]
response = orjson.dumps(output.tolist()) response = orjson.dumps(output.tolist())
print("Query done: " + str(time() - t0)) print("Query done: " + str(time() - t0))
return response return response
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment