Commit a224e6da authored by Kruyff,D.L.W. (Dylan)
Browse files

Trying simple hashing with low memory use (not working)


Former-commit-id: bb598b63
parent 9cabbe5f
......@@ -17,9 +17,9 @@ export class CacheService {
private _sliderValue;
private _queryWindow;
public windowSize = 120;
public nrOfTables = 20;
public hashSize = 8;
public windowSize = 200;
public nrOfTables = 5;
public hashSize = 2;
public stepSize = 200;
public querySelectionMode = true;
......
......@@ -126,13 +126,17 @@
<option name="oldMeFiltersMigrated" value="true" />
</component>
<component name="WindowStateProjectService">
<state x="686" y="355" width="610" height="403" key="#com.intellij.fileTypes.FileTypeChooser" timestamp="1600727680781">
<state x="686" y="355" width="664" height="403" key="#com.intellij.fileTypes.FileTypeChooser" timestamp="1601384380854">
<screen x="72" y="27" width="1848" height="1053" />
</state>
<state x="686" y="355" width="610" height="403" key="#com.intellij.fileTypes.FileTypeChooser/72.27.1848.1053@72.27.1848.1053" timestamp="1600727680781" />
<state x="779" y="311" width="424" height="491" key="FileChooserDialogImpl" timestamp="1600726193087">
<state x="686" y="355" width="664" height="403" key="#com.intellij.fileTypes.FileTypeChooser/72.27.1848.1053@72.27.1848.1053" timestamp="1601384380854" />
<state x="721" y="422" width="1200" height="800" key="DiffContextDialog" timestamp="1601242420342">
<screen x="72" y="27" width="1848" height="1053" />
</state>
<state x="779" y="311" width="424" height="491" key="FileChooserDialogImpl/72.27.1848.1053@72.27.1848.1053" timestamp="1600726193087" />
<state x="721" y="422" width="1200" height="800" key="DiffContextDialog/72.27.1848.1053@72.27.1848.1053" timestamp="1601242420342" />
<state x="779" y="311" width="424" height="491" key="FileChooserDialogImpl" timestamp="1601285087193">
<screen x="72" y="27" width="1848" height="1053" />
</state>
<state x="779" y="311" width="424" height="491" key="FileChooserDialogImpl/72.27.1848.1053@72.27.1848.1053" timestamp="1601285087193" />
</component>
</project>
\ No newline at end of file
......@@ -13,8 +13,9 @@ import bigwig
import bbi
from bitarray import bitarray
import _ucrdtw
from scipy.sparse import dia_matrix
reload = False
reload = True
app = Flask(__name__)
CORS(app)
......@@ -35,14 +36,38 @@ def calculate_signatures_cumsum_weights(data, window_size=None, hash_size=None,
signatures_int = np.packbits(signatures_bool)
return signatures_int.tolist(), hash_function
def calculate_signatures_cumsum_weights(data, window_size=None, hash_size=None, hash_function=None):
def calculate_signatures_new(data, window_size=None, hash_size=None, hash_function=None):
if hash_function is None:
hash_function = np.array([np.cumsum(np.random.uniform(-1, 1, window_size)) for _ in range(hash_size)]).transpose()
signatures_bool = np.dot(data, hash_function) > 0
signatures_int = np.packbits(signatures_bool)
if len(data) == len(np.array(hash_function)[:, 0]):
signatures_bool = np.dot(data, hash_function) > 0
output = signatures_bool.astype(int)[0]
print(output)
return output
print('starting hashing')
t0 = time()
all_signatures = []
batch_size = 20
data = data.transpose()
temp = np.zeros((batch_size, window_size + batch_size - 1))
for h in range(hash_size):
for i in range(batch_size):
temp[i, i:i + window_size] = hash_function[:, h]
print('first: ' + str(time() - t0))
signatures_bool = [np.dot(temp, data[i:i + window_size + batch_size - 1]) > 0 for i in range(0, len(data) - window_size, batch_size)]
# signatures_bool = []
# for i in range(0, len(data) - window_size, batch_size):
# if i % 1000000 == 0:
# print(i)
# signatures_bool.append(np.dot(temp, data[i:i + window_size + batch_size - 1]) > 0)
print('second: ' + str(time() - t0))
all_signatures.append(np.array(signatures_bool).flatten().astype(int))
print('done')
signatures_int = np.packbits(np.stack(np.array(all_signatures), axis=1), axis=0).flatten()
return signatures_int.tolist(), hash_function
lsh_function = calculate_signatures_cumsum_weights
lsh_function = calculate_signatures_new
@app.route('/', methods=['GET'])
def index():
......@@ -61,11 +86,10 @@ def read_data():
}
response = orjson.dumps(response)
print('Data read: ' + str(time()-t0))
query = data[10000:11200]
print(query)
loc, dist = _ucrdtw.ucrdtw(data, query, 0.05, True)
print(data[loc:loc+120])
print('found query: ' + str(loc) + '[' + str(time()-t0) + ']')
# query = data[12000:24000]
# loc, dist = _ucrdtw.ucrdtw(data, query, 0.05, True)
# print(data[loc:loc+120])
# print('found query: ' + str(loc) + '[' + str(time()-t0) + ']')
return response
@app.route('/create-windows', methods=['POST'])
......@@ -74,14 +98,10 @@ def create_windows():
if reload:
raw_data = request.json
window_size = int(raw_data['parameters']["windowsize"])
data = bigwig.chunk(
'test.bigWig',
12000,
int(12000 / window_size),
int(12000 / 6),
['chr1'],
verbose=True,
)
chromsize = bbi.chromsizes('test.bigWig')['chr1']
step_size = chromsize / 10000
data = bigwig.get('test.bigWig', 'chr1', 0, chromsize, 20000000)
data = (data - np.min(data))/np.ptp(data)
print(data.shape)
np.save('processed-data', data)
print('Windows created: ' + str(time()-t0))
......@@ -116,7 +136,10 @@ def lsh(data, window_size, hash_size, table_size):
for index in range(table_size):
signatures, hash_function = lsh_function(data, window_size=window_size, hash_size=hash_size)
table = {k: v for v, k in enumerate(signatures)}
print('creating dictionary')
table = defaultdict(list)
for v, k in enumerate(signatures):
table[k].append(v)
tables.append(table)
tables_hash_function.append(hash_function.tolist())
......@@ -135,7 +158,7 @@ def similarity():
for t in tables.values():
signature = lsh_function(window, hash_function=t["hash"])
neighbours.extend(t["entries"][signature])
neighbours.extend(t["entries"][str(signature)])
neighbours_with_frequency = dict(Counter(neighbours))
for index, frequency in neighbours_with_frequency.items():
output[str(frequency)].append(index)
......@@ -208,7 +231,7 @@ def query():
raw_data = orjson.loads(request.data)
window = raw_data['window']
if isinstance(window, int):
output = np.load('processed-data.npy')[window]
output = np.load('processed-data.npy')[window:window+12000]
response = orjson.dumps(output.tolist())
print("Query done: " + str(time() - t0))
return response
......
No preview for this file type
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment