Commit 0bc4db2d authored by Kruyff,D.L.W. (Dylan)'s avatar Kruyff,D.L.W. (Dylan)
Browse files

Roughly stable version for demo


Former-commit-id: 33ad2457
parent 4b03da69

Too many changes to show.

To preserve performance only 1000 of 1000+ files are displayed.
......@@ -8,7 +8,7 @@ export interface RawData {
export interface LshData {
candidates: number[];
distances: number[];
hash_functions: number[][][][];
hash_functions: number[];// number[][][][];
parameters?: number[];
}
......
......@@ -44,7 +44,9 @@ export class LabelingWindowComponent implements OnInit {
}
async showSamples() {
this.topk = this.state.lshData.candidates.slice(0, this.k);
this.topk = this.state.lshData.candidates
.filter((candidate) => this.state.labels[candidate] !== true)
.slice(0, this.k);
this.subplots = [];
const values = await this.state.getWindow(this.topk);
this.topk.forEach((index, i) => {
......
......@@ -125,6 +125,8 @@ export class ProgressViewComponent implements OnInit {
const width = +svg.attr('width');
const height = +svg.attr('height');
svg.selectAll('*').remove();
const simulation = d3.forceSimulation()
.force('link', d3.forceLink().id((d: any) => d.id))
.force('charge', d3.forceManyBody().strength(100)) // Gravity force
......
......@@ -86,7 +86,7 @@ export class StateService {
}
public createTable() {
const indices: number[] = this.lshData.distances.map((x) => x > 500 ? 100 : Math.floor(x / 5));
const indices: number[] = this.lshData.distances.map((x) => x > 500 ? 100 : Math.floor(x / 10));
const table = {};
this.lshData.candidates.forEach((candidate, index) => {
if (table[indices[index]] === undefined)
......
This diff is collapsed.
......@@ -17,7 +17,7 @@ import cooler
import numpy as np
import os
import pandas as pd
import utils
from sklearn.preprocessing import MinMaxScaler
TILE_SIZE = 1024
......@@ -26,6 +26,13 @@ TILESET_INFO = {"filetype": "bigwig", "datatype": "vector"}
FILE_EXT = {"bigwig", "bw"}
def normalize_data(data, percentile: float = 99.9):
cutoff = np.percentile(data, (0, percentile))
data_norm = np.copy(data)
data_norm[np.where(data_norm < cutoff[0])] = cutoff[0]
data_norm[np.where(data_norm > cutoff[1])] = cutoff[1]
return MinMaxScaler().fit_transform(data_norm)
def is_bigwig(filepath=None, filetype=None):
if filepath is None:
......@@ -286,7 +293,7 @@ def chunk(
)
if normalize:
values[start:end] = utils.normalize(
values[start:end] = normalize_data(
values[start:end], percentile=percentile
)
......
......@@ -81,7 +81,7 @@ def initialize():
query = np.reshape(query, (len(query), 1))
# query = np.repeat(query, repeats=1, axis=1)
r, a, sd = preprocess()
r, a, sd = preprocess(data)
candidates, distances, hf = _lsh.lsh(data, query, r, a, sd)
response = {
......@@ -168,12 +168,12 @@ def table_info():
print("Averages calculated: " + str(time() - t0))
return response
def preprocess():
return 0.10882589134534404, 3.1202154563478928, 0.9705780396843037
data = np.load('processed-data.npy')
def preprocess(data):
# return 0.10882589134534404, 3.1202154563478928, 0.9705780396843037
# data = np.load('processed-data.npy')
data = np.array(data, dtype='double')
data = np.reshape(data, (int(len(data) / 1), 1, len(data[0])))
data = np.repeat(data, repeats=1, axis=1)
# data = np.reshape(data, (int(len(data) / 1), 1, len(data[0])))
# data = np.repeat(data, repeats=1, axis=1)
subset = []
t0 = time()
......@@ -200,7 +200,8 @@ def preprocess():
continue
e = np.linalg.norm(data[index_1] - data[index_2])
eq_distances.append(e)
d = dtw.dtw(data[index_1], data[index_2], dist_method="Euclidean", window_type="sakoechiba", window_args={"window_size": 120}).distance
d = _ucrdtw.ucrdtw(data[index_1], data[index_2], 0.05, False)[1]
# d = dtw.dtw(data[index_1], data[index_2], dist_method="Euclidean", window_type="sakoechiba", window_args={"window_size": 120}).distance
dtw_distances.append(d)
ratios = np.array(dtw_distances)/np.array(eq_distances)
......@@ -214,20 +215,20 @@ def preprocess():
# theta = mean_eq + -2.58 * sd_eq
r = theta / ((a-sd)*math.sqrt(120))
# r = theta / (math.sqrt(120))
print('Mean: ' + mean_dtw)
print('Stdev: ' + sd_dtw)
print('Ratio mean: ' + a)
print('Ratio stdev: ' + sd)
print('Theta: ' + theta)
print('r: ' + r)
print('Mean: ' + str(mean_dtw))
print('Stdev: ' + str(sd_dtw))
print('Ratio mean: ' + str(a))
print('Ratio stdev: ' + str(sd))
print('Theta: ' + str(theta))
print('r: ' + str(r))
print('Preprocessing time: ' + str(time() - t0))
return r, a, sd
def debug_test_lsh():
r, a, sd = preprocess()
data = np.load('processed-data.npy')
r, a, sd = preprocess(data)
create_windows()
query_n = 80503
data = np.load('processed-data.npy')
query = data[query_n] # performDBA(data[[80503, 11514]])
query = np.reshape(query, (len(data[0]), 1))
data= np.array(data, dtype='double')
......@@ -256,4 +257,4 @@ def debug_test_lsh():
accuracy += 1
print(accuracy)
debug_test_lsh()
\ No newline at end of file
# debug_test_lsh()
\ No newline at end of file
......@@ -11,6 +11,7 @@ dependencies:
- flask_cors
- orjson
- dask[dataframe]
- jupyter
- pip
- pip:
- pybbi
......
This diff is collapsed.
'''
/*******************************************************************************
* Copyright (C) 2018 Francois Petitjean
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
'''
from __future__ import division
import numpy as np
from functools import reduce
__author__ ="Francois Petitjean"
def performDBA(series, n_iterations=10):
    """Compute the DBA (DTW Barycenter Averaging) average of a set of series.

    Arguments:
        series -- sequence of 1-D numeric arrays (lengths may differ)
        n_iterations -- number of refinement passes applied to the medoid seed

    Returns:
        The averaged series (same length as the chosen medoid).
    """
    # Scratch matrices sized for the longest series; reused by every DTW call.
    max_length = reduce(max, map(len, series))
    cost_mat = np.zeros((max_length, max_length))
    delta_mat = np.zeros((max_length, max_length))
    path_mat = np.zeros((max_length, max_length), dtype=np.int8)
    # Seed with the (approximate) medoid, then iteratively refine it.
    center = series[approximate_medoid_index(series, cost_mat, delta_mat)]
    for _ in range(n_iterations):
        center = DBA_update(center, series, cost_mat, path_mat, delta_mat)
    return center
def approximate_medoid_index(series, cost_mat, delta_mat):
    """Return the index of the series with the smallest total squared DTW
    distance to all others, evaluated over at most 50 random candidates."""
    if len(series) <= 50:
        candidate_indices = range(0, len(series))
    else:
        candidate_indices = np.random.choice(range(0, len(series)), 50, replace=False)
    best_index = -1
    best_score = 1e20
    for candidate_index in candidate_indices:
        score = sum_of_squares(series[candidate_index], series, cost_mat, delta_mat)
        if best_index == -1 or score < best_score:
            best_score = score
            best_index = candidate_index
    return best_index
def sum_of_squares(s, series, cost_mat, delta_mat):
    """Total squared DTW distance from s to every series in the set."""
    return sum(squared_DTW(s, t, cost_mat, delta_mat) for t in series)
def DTW(s, t, cost_mat, delta_mat):
    """DTW distance: square root of the accumulated squared-difference cost."""
    squared_cost = squared_DTW(s, t, cost_mat, delta_mat)
    return np.sqrt(squared_cost)
def squared_DTW(s, t, cost_mat, delta_mat):
    """Squared DTW distance between series s and t.

    cost_mat and delta_mat are caller-owned scratch buffers of size at least
    len(s) x len(t); both are overwritten on every call.
    """
    s_len = len(s)
    t_len = len(t)
    # Pairwise squared differences feed the DP recurrence below.
    fill_delta_mat_dtw(s, t, delta_mat)
    cost_mat[0, 0] = delta_mat[0, 0]
    # First column and first row have only one possible predecessor.
    for i in range(1, s_len):
        cost_mat[i, 0] = cost_mat[i - 1, 0] + delta_mat[i, 0]
    for j in range(1, t_len):
        cost_mat[0, j] = cost_mat[0, j - 1] + delta_mat[0, j]
    # Interior cells take the cheapest of the diagonal, left and top paths.
    for i in range(1, s_len):
        for j in range(1, t_len):
            best_prev = min(
                cost_mat[i - 1, j - 1], cost_mat[i, j - 1], cost_mat[i - 1, j]
            )
            cost_mat[i, j] = best_prev + delta_mat[i, j]
    return cost_mat[s_len - 1, t_len - 1]
def fill_delta_mat_dtw(center, s, delta_mat):
    """Write the pairwise squared differences of center vs s into the
    top-left len(center) x len(s) corner of delta_mat, in place."""
    region = delta_mat[:len(center), :len(s)]
    np.subtract.outer(center, s, out=region)
    np.multiply(region, region, out=region)
def DBA_update(center, series, cost_mat, path_mat, delta_mat):
    """One DBA refinement pass: DTW-align every series to `center` and return
    the new center as the per-position mean of all aligned values.

    cost_mat, path_mat and delta_mat are caller-owned scratch matrices of
    size at least len(center) x max(len(s)); they are overwritten per series.
    """
    # Backtracking moves indexed by path codes: 0=diagonal, 1=left, 2=up.
    options_argmin = [(-1, -1), (0, -1), (-1, 0)]
    # Sum of series values aligned to each center position, and their counts.
    updated_center = np.zeros(center.shape)
    n_elements = np.array(np.zeros(center.shape), dtype=int)
    center_length = len(center)
    for s in series:
        s_len = len(s)
        # Pairwise squared differences between center and s.
        fill_delta_mat_dtw(center, s, delta_mat)
        # DP initialization; -1 in path_mat marks the backtracking origin.
        cost_mat[0, 0] = delta_mat[0, 0]
        path_mat[0, 0] = -1
        # First column can only come from above (code 2) ...
        for i in range(1, center_length):
            cost_mat[i, 0] = cost_mat[i-1, 0]+delta_mat[i, 0]
            path_mat[i, 0] = 2
        # ... first row only from the left (code 1).
        for j in range(1, s_len):
            cost_mat[0, j] = cost_mat[0, j-1]+delta_mat[0, j]
            path_mat[0, j] = 1
        # Fill the DP table, recording which predecessor won in path_mat.
        for i in range(1, center_length):
            for j in range(1, s_len):
                diag,left,top =cost_mat[i-1, j-1], cost_mat[i, j-1], cost_mat[i-1, j]
                if(diag <=left):
                    if(diag<=top):
                        res = diag
                        path_mat[i,j] = 0
                    else:
                        res = top
                        path_mat[i,j] = 2
                else:
                    if(left<=top):
                        res = left
                        path_mat[i,j] = 1
                    else:
                        res = top
                        path_mat[i,j] = 2
                cost_mat[i, j] = res+delta_mat[i, j]
        # Backtrack the optimal warping path from the bottom-right corner,
        # accumulating each s[j] onto the center position i it aligned with.
        i = center_length-1
        j = s_len-1
        while(path_mat[i, j] != -1):
            updated_center[i] += s[j]
            n_elements[i] += 1
            move = options_argmin[path_mat[i, j]]
            i += move[0]
            j += move[1]
        # A valid path always terminates at the (0, 0) origin cell.
        assert(i == 0 and j == 0)
        updated_center[i] += s[j]
        n_elements[i] += 1
    # Per-position mean of everything aligned to each center position.
    return np.divide(updated_center, n_elements)
\ No newline at end of file
"""
Copyright 2018 Novartis Institutes for BioMedical Research Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import base64
import bbi
import cooler
import numpy as np
import os
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
TILE_SIZE = 1024
TILESET_INFO = {"filetype": "bigwig", "datatype": "vector"}
FILE_EXT = {"bigwig", "bw"}
def normalize_data(data, percentile: float = 99.9):
    """Clamp data to its [0th, `percentile`] percentile range, then rescale
    each column to [0, 1]. The input array is left unmodified."""
    lower, upper = np.percentile(data, (0, percentile))
    clipped = np.clip(data, lower, upper)
    return MinMaxScaler().fit_transform(clipped)
def is_bigwig(filepath=None, filetype=None):
    """Return True when the given path or explicit filetype denotes a bigWig.

    Arguments:
        filepath -- path to inspect; a None path is never a bigwig, even if
            filetype says otherwise
        filetype -- optional explicit type string ("bigwig" short-circuits)
    """
    if filepath is None:
        return False
    if filetype == "bigwig":
        return True
    # Fall back to the file extension (e.g. ".bigwig" / ".bw").
    file_ext = os.path.splitext(filepath)[1]
    return file_ext[1:].lower() in FILE_EXT
def get_quadtree_depth(chromsizes):
    """Number of zoom levels needed for a single TILE_SIZE tile to cover
    the whole genome described by `chromsizes`."""
    min_tile_cover = np.ceil(sum(chromsizes) / TILE_SIZE)
    return int(np.ceil(np.log2(min_tile_cover)))
def get_zoom_resolutions(chromsizes):
    """Bin resolutions per zoom level, coarsest (largest bins) first."""
    max_zoom = get_quadtree_depth(chromsizes)
    return [2 ** level for level in reversed(range(max_zoom + 1))]
def get_chromsizes(bwpath):
    """TODO: replace this with negspy.

    Also, return NaNs from any missing chromosomes in bbi.fetch
    """
    sizes = bbi.chromsizes(bwpath)
    ordering = cooler.util.natsorted(sizes.keys())
    # Series of chromosome lengths, naturally sorted by chromosome name.
    return pd.Series(sizes)[ordering]
def chr2abs(chromsizes, chr: str, start: int, end: int):
    """Convert chromosomal coordinates to absolute genome coordinates.

    Arguments:
        chromsizes -- pd.Series of chromosome lengths, indexed by name
        chr -- chromosome name
        start -- start position within the chromosome
        end -- end position within the chromosome

    Returns:
        Tuple of (absolute start, absolute end).
    """
    # Offset = total length of every chromosome preceding `chr`.
    offset = np.cumsum(chromsizes)[chr] - chromsizes[chr]
    return (start + offset, end + offset)
def abs2chr(chromsizes, start_pos: int, end_pos: int, is_idx2chr: bool = False):
    """Convert an absolute genome interval to per-chromosome intervals.

    Arguments:
        chromsizes -- pd.Series of chromosome lengths
        start_pos -- absolute start coordinate
        end_pos -- absolute end coordinate
        is_idx2chr -- yield chromosome names instead of integer indices

    Yields:
        (chromosome, start, end) triples covering the requested interval.
    """
    offsets = np.r_[0, np.cumsum(chromsizes.values)]
    cid_lo, cid_hi = (
        np.searchsorted(offsets, [start_pos, end_pos], side="right") - 1
    )

    def label(cid):
        return chromsizes.index[cid] if is_idx2chr else cid

    start = start_pos - offsets[cid_lo]
    # Every chromosome before the last one touched is covered to its end.
    for cid in range(cid_lo, cid_hi):
        yield label(cid), start, chromsizes[cid]
        start = 0
    yield label(cid_hi), start, end_pos - offsets[cid_hi]
def get_tile(bwpath, zoom_level, start_pos, end_pos, chromsizes=None):
    """Fetch the binned signal for an absolute interval at one zoom level,
    concatenating per-chromosome segments into a single array."""
    if chromsizes is None:
        chromsizes = get_chromsizes(bwpath)
    binsize = get_zoom_resolutions(chromsizes)[zoom_level]
    segments = []
    for cid, start, end in abs2chr(chromsizes, start_pos, end_pos):
        n_bins = int(np.ceil((end - start) / binsize))
        try:
            chrom = chromsizes.index[cid]
            clen = chromsizes.values[cid]
            values = bbi.fetch(bwpath, chrom, start, end, bins=n_bins, missing=np.nan)
            # drop the very last bin if it is smaller than the binsize
            if end == clen and clen % binsize != 0:
                values = values[:-1]
        except IndexError:
            # beyond the range of the available chromosomes
            # probably means we've requested a range of absolute
            # coordinates that stretch beyond the end of the genome
            values = np.zeros(n_bins)
        segments.append(values)
    return np.concatenate(segments)
def tiles(bwpath, tile_ids, chromsizes=None):
    """Generate tiles from a bigwig file.

    Parameters
    ----------
    bwpath: str
        Path to the bigwig file tiles are served from
    tile_ids: [str,...]
        A list of tile_ids (e.g. xyx.0.0) identifying the tiles
        to be retrieved
    chromsizes: pd.Series, optional
        Pre-computed chromosome sizes; fetched from the file if omitted

    Returns
    -------
    tile_list: [(tile_id, tile_data),...]
        A list of tile_id, tile_data tuples
    """
    generated_tiles = []
    for tile_id in tile_ids:
        parts = tile_id.split(".")
        zoom_level, tile_pos = (int(p) for p in parts[1:3])
        # Lazily fetch chromsizes once and reuse for subsequent tiles.
        if chromsizes is None:
            chromsizes = get_chromsizes(bwpath)
        max_depth = get_quadtree_depth(chromsizes)
        tile_size = TILE_SIZE * 2 ** (max_depth - zoom_level)
        start_pos = tile_pos * tile_size
        end_pos = start_pos + tile_size
        dense = get_tile(bwpath, zoom_level, start_pos, end_pos, chromsizes=chromsizes)
        if len(dense):
            max_dense = max(dense)
            min_dense = min(dense)
        else:
            max_dense = 0
            min_dense = 0
        # Use float16 only when every value fits its range and nothing is NaN.
        f16 = np.finfo("float16")
        has_nan = any(np.isnan(d) for d in dense)
        fits_in_f16 = (
            not has_nan
            and f16.min < max_dense < f16.max
            and f16.min < min_dense < f16.max
        )
        dtype = "float16" if fits_in_f16 else "float32"
        tile_value = {
            "dense": base64.b64encode(dense.astype(dtype)).decode("utf-8"),
            "dtype": dtype,
        }
        generated_tiles.append((tile_id, tile_value))
    return generated_tiles
def tileset_info(bwpath):
    """Get the tileset info for a bigWig file.

    Parameters
    ----------
    bwpath: string
        Path to the bigwig file

    Returns
    -------
    tileset_info: {
        'min_pos': [],
        'max_pos': [],
        'max_width': 131072
        'tile_size': 1024,
        'max_zoom': 7
    }
    """
    tile_size = 1024
    chromsizes = get_chromsizes(bwpath)
    # Smallest zoom depth at which one tile spans the whole genome.
    min_tile_cover = np.ceil(np.sum(chromsizes) / tile_size)
    max_zoom = int(np.ceil(np.log2(min_tile_cover)))
    max_width = tile_size * 2 ** max_zoom
    return {
        "min_pos": [0],
        "max_pos": [max_width],
        "max_width": max_width,
        "tile_size": tile_size,
        "max_zoom": max_zoom,
    }
def chunk(
bigwig: str,
window_size: int,
resolution: int,
step_size: int,
chroms: list,
normalize: bool = Tr