Commit 4944fbae authored by Kruyff,D.L.W. (Dylan)
Browse files

Make prototype compatible for larger data size

parent 12614e4f
......@@ -15,7 +15,10 @@ export class ApiService {
// Read input data
/**
 * Fetches the raw input data from the backend.
 *
 * The response body is decoded as JSON, after which its `index` and `values`
 * fields (each itself a JSON-encoded string) are parsed a second time. Every
 * parsed value is converted with `Number`.
 *
 * @returns The parsed index together with the numeric values.
 * @throws Error when the server answers with a non-2xx status.
 */
async readFile(): Promise<RawData> {
  const response = await fetch('http://127.0.0.1:5000/read-data');
  if (!response.ok) {
    // Fail with the HTTP status instead of a confusing JSON parse error.
    throw new Error(`read-data request failed with status ${response.status}`);
  }
  const payload = await response.json();
  const index = JSON.parse(payload.index);
  const values = JSON.parse(payload.values).map(Number);
  return {index, values};
}
// Split data into windows and normalize
......
<app-overview-window></app-overview-window>
<mat-tab-group animationDuration="0ms">
<mat-tab label="Query">
<div style="display: flex; justify-content: space-between;">
<div style="width: 80%;">
<app-overview-window></app-overview-window>
</div>
<div style="width: 20%;">
<app-query-window></app-query-window>
</mat-tab>
</div>
</div>
<mat-tab-group animationDuration="0ms" (selectedTabChange)="changeTab($event)">
<mat-tab label="Samples">
<app-labeling-window></app-labeling-window>
</mat-tab>
......
import { Component } from '@angular/core';
import {CacheService} from './cache.service';
/**
 * Root component (`app-root`). Its only behaviour is forwarding the selected
 * tab index to the shared {@link CacheService}.
 */
@Component({
  selector: 'app-root',
  templateUrl: './app.component.html',
})
export class AppComponent {
  constructor(private readonly service: CacheService) {}

  /**
   * Stores the index of the newly selected tab on the cache service.
   *
   * @param tab Tab-change event; only its `index` property is read, so any
   *            object exposing a numeric `index` is accepted.
   */
  changeTab(tab: { index: number }): void {
    this.service.currentTab = tab.index;
  }
}
......@@ -8,13 +8,14 @@ export class CacheService {
public rawValues: number[];
public rawIndices: string[];
public _windows: number[][];
private _currentTab: number;
private _windows: number[][];
private _query = undefined;
public _labels = {};
public _tables;
public _windowSimilarity;
private _labels = {};
private _tables;
private _windowSimilarity;
public windowSize = 20;
public windowSize = 60;
public nrOfTables = 10;
public hashSize = 10;
......@@ -23,6 +24,7 @@ export class CacheService {
public onNewQuery: EventEmitter<void> = new EventEmitter<void>();
public onNewTables: EventEmitter<void> = new EventEmitter<void>();
public onNewWindows: EventEmitter<void> = new EventEmitter<void>();
public onNewTab: EventEmitter<void> = new EventEmitter<void>();
public initialized: Promise<void>;
......@@ -45,16 +47,19 @@ export class CacheService {
/**
 * Reads the raw data through the API service, logs it, and caches its
 * index and values on this service.
 */
async getRawData(): Promise<void> {
  const loaded: RawData = await this.api.readFile();
  console.log(loaded);
  const {index, values} = loaded;
  this.rawIndices = index;
  this.rawValues = values;
}
/**
 * Asks the API service to create windows from the cached raw values using the
 * current parameters, stores the result in `windows`, and logs it.
 */
async getWindows(): Promise<void> {
  const created = await this.api.createWindows(this.rawValues, this.parameters);
  this.windows = created;
  console.log(this.windows);
}
/**
 * Asks the API service to build tables from the current windows and
 * parameters, stores the result in `tables`, and logs it.
 */
async createTables(): Promise<void> {
  const built = await this.api.createTables(this.windows, this.parameters);
  this.tables = built;
  console.log(this.tables);
}
async getSimilarWindows(window): Promise<any> {
......@@ -111,6 +116,16 @@ export class CacheService {
return this._windowSimilarity;
}
/** The tab index most recently stored through the setter. */
public get currentTab() {
  return this._currentTab;
}

/**
 * Stores the new tab index, logs the stored value, and then notifies
 * `onNewTab` subscribers.
 */
public set currentTab(v) {
  this._currentTab = v;
  console.log(this.currentTab);
  this.onNewTab.emit();
}
public get parameters(): {[parameter: string]: any} {
return {
windowsize: this.windowSize,
......
<!-- Overview plot, rendered once `showPlot` is set; the wrapper scrolls when the plot overflows. -->
<div style="overflow: auto">
  <plotly-plot *ngIf="showPlot" [data]="data" [layout]="layout" (plotly_click)="clicked($event)"></plotly-plot>
</div>
......@@ -8,9 +8,9 @@ import {throwError} from 'rxjs';
styleUrls: ['./overview-window.component.css']
})
export class OverviewWindowComponent implements OnInit {
public defaultColors: string[] = [];
public defaultSizes: number[] = [];
public defaultOpacity: number[] = [];
public defaultColors: string[];
public defaultSizes: number[];
public defaultOpacity: number[];
public showPlot = false;
public data;
......@@ -29,15 +29,14 @@ export class OverviewWindowComponent implements OnInit {
async initializePlot() {
this.service.query = undefined;
for (const _ of this.service.rawValues) {
this.defaultColors.push('#a3a7e4');
this.defaultSizes.push(5);
this.defaultOpacity.push(1);
}
const size = this.service.rawValues.length;
this.defaultColors = Array(size).fill('#a3a7e4');
this.defaultSizes = Array(size).fill(5);
this.defaultOpacity = Array(size).fill(1);
this.data = [{
x: this.service.rawIndices,
y: this.service.rawValues,
type: 'scatter',
type: 'scattergl',
mode: 'markers',
marker: {
size: this.defaultSizes.slice(),
......@@ -47,15 +46,20 @@ export class OverviewWindowComponent implements OnInit {
hovermode: 'closest',
autosize: true,
margin: {
l: 0,
l: 40,
r: 0,
b: 40,
t: 0,
pad: 4
},
height: 200,
xaxis: {
showticklabels: false,
// rangeslider: {}
},
};
this.showPlot = true;
console.log("showing plot");
}
async clicked(clickData) {
......@@ -73,15 +77,31 @@ export class OverviewWindowComponent implements OnInit {
const sizes: number[] = [];
const opacity: number[] = [];
// Similarity
const windowSimilarity = await this.service.getSimilarWindows(this.service.windows[this.service.query]);
for (const frequency in windowSimilarity){
for (const index of windowSimilarity[frequency]) {
colors[index] = this.getColor(Number(frequency) / this.service.nrOfTables);
sizes[index] = (Number(frequency) / this.service.nrOfTables) * 10;
opacity[index] = Number(frequency) / this.service.nrOfTables;
sizes[index] = 5;
opacity[index] = Math.max(Number(frequency) / this.service.nrOfTables, 0.5);
}
}
// Labeled
for (const index in this.service.labels) {
colors[Number(index)] = this.service.labels[index] ? '#4caf50' : '#f44336';
sizes[Number(index)] = 10;
opacity[Number(index)] = 1;
}
// Query
colors[this.service.query] = '#cf00ff';
sizes[this.service.query] = 10;
opacity[this.service.query] = 1;
this.data[0].marker.color = colors;
this.data[0].marker.size = sizes;
this.data[0].marker.opacity = opacity;
}
public getColor(value: number) {
......
/* Outer box of the query panel: centred, 80% wide, black border,
   with its children centred horizontally via flexbox. */
.query-container {
margin: auto;
border: 2px solid black;
width: 80%;
display: flex;
justify-content: center;
}
/* Inner content of the query panel, centred inside the flex container. */
.query-contents {
margin: auto;
}
<!-- Query panel: shows a hint until a query exists, then the query plot. -->
<div class="query-container">
  <!-- No query selected yet. -->
  <div *ngIf="!query">
    Select a point in the data to start the similarity search.
  </div>
  <!-- A query is selected: title plus its plot. -->
  <div *ngIf="query" class="query-contents">
    <span style="display: flex; justify-content: center"><b>Current query</b></span>
    <plotly-plot [data]="plot.data" [layout]="plot.layout"></plotly-plot>
  </div>
</div>
......@@ -34,7 +34,7 @@ export class QueryWindowComponent implements OnInit {
hovermode: 'closest',
autosize: true,
margin: {
l: 30,
l: 50,
r: 30,
t: 30,
pad: 4
......
......@@ -26,7 +26,7 @@ export class TableOverviewComponent implements OnInit {
{
data: [{
x: Object.keys(table.entries).map((hash: string) => {
return hash;
return Number('0b' + hash);
}
),
y: Object.values(table.entries).map((values: number[]) => values.length / this.service.windows.length),
......
This diff is collapsed.
This source diff could not be displayed because it is stored in LFS. You can view the blob instead.
This source diff could not be displayed because it is stored in LFS. You can view the blob instead.
......@@ -5,6 +5,13 @@ import numpy as np
from flask_cors import CORS
from collections import defaultdict, Counter
from time import time
import dask.dataframe as dd
import os.path
import json
from sklearn import preprocessing
from functools import partial
from itertools import groupby
from multiprocessing import Pool
app = Flask(__name__)
CORS(app)
......@@ -15,34 +22,58 @@ def index():
@app.route('/read-data', methods=['GET'])
def read_data():
    """Return the 'date' and 't' columns of one station as a JSON response.

    On the first call the source CSV is read with dask, restricted to the
    rows of station 14066001, and cached as a pickle file; later calls load
    that cache directly. Rows without a 't' value are dropped.

    Returns:
        A Flask JSON response with two fields, "index" and "values". Each
        field is itself a JSON-encoded list of strings, so the client has to
        decode it a second time.
    """
    filename = 'processed-data.pkl'
    if not os.path.isfile(filename):
        # Build the cache once; filtering the full CSV is the expensive step.
        print("start")
        df = dd.read_csv("NW_Ground_Stations_2016.csv", usecols=['number_sta', 'date', 't'])
        print("read file")
        df = df.loc[df['number_sta'] == 14066001]
        print("split rows")
        df = df.compute()
        df.to_pickle(filename)
        print("to_pandas")
    # NOTE(review): unpickling is only safe while the cache file is trusted,
    # i.e. written by this function and not replaced by someone else.
    df = pd.read_pickle(filename)
    df.dropna(subset=['t'], inplace=True)
    response = {
        "index": json.dumps(df.loc[:, 'date'].values.astype(str).tolist()),
        "values": json.dumps(df.loc[:, 't'].values.astype(str).tolist())
    }
    print("response ready")
    response = jsonify(response)
    return response
# @app.route('/read-data', methods=['GET'])
# def read_data():
# df = pd.read_csv("1.csv", index_col=3)
# df.index = pd.to_datetime(df.index)
# df.sort_index(inplace=True)
# meantemp = df.loc[:, 7].copy()
# response = {
# "index": meantemp.index.values.astype(str).tolist(),
# "values": meantemp.values.tolist()
# }
# response = jsonify(response)
# return response
@app.route('/create-windows', methods=['POST'])
def create_windows():
    """Split the posted series into sliding windows and rescale each window.

    Expects a JSON body with "values" (the series) and
    "parameters"["windowsize"]. Windows start at every position from 0 up to
    (but not including) ``len(values) - window_size`` and each window is
    min-max scaled on its own to the range (-1, 1).

    Returns:
        A Flask JSON response holding the list of scaled windows; an empty
        list when the series is too short to produce a window.
    """
    raw_data = request.json
    values = raw_data["values"]
    window_size = int(raw_data['parameters']["windowsize"])
    # NOTE(review): this range stops one short of the final full window
    # (starting at len(values) - window_size) - confirm that is intended.
    data = [values[i:i + window_size] for i in range(len(values) - window_size)]
    if not data:
        # The scaler is expected to reject an empty sample set, so answer
        # directly instead of failing the request.
        return jsonify([])
    data = preprocessing.minmax_scale(data, (-1, 1), axis=1)
    response = jsonify(data.tolist())
    return response
def fill_table(data, hash_functions, index):
    """Group window indices by their signature under one hash function.

    Args:
        data: 2-D array with one window per row.
        hash_functions: Sequence of projection matrices; only
            ``hash_functions[index]`` is used.
        index: Position of the hash function to apply.

    Returns:
        A ``defaultdict(list)`` mapping each signature (a string of '0'/'1'
        characters, one per projection column) to the row indices of the
        windows that produced it, in ascending order.
    """
    hash_function = hash_functions[index]
    table = defaultdict(list)
    for window_index in range(data.shape[0]):
        # A bit is '1' where the projection of the window is strictly positive.
        bits = (np.dot(data[window_index], hash_function) > 0).astype('int').astype('str')
        table[''.join(bits)].append(window_index)
    return table
@app.route('/create-tables', methods=['POST'])
def create_tables():
t0 = time()
......@@ -52,15 +83,17 @@ def create_tables():
hash_size = int(raw_data['parameters']["hashsize"])
table_size = int(raw_data['parameters']["tablesize"])
data = np.array(data)
tables = [defaultdict(list) for _ in range(table_size)]
tables_hash_function = [np.random.randn(window_size, hash_size) for _ in range(table_size)]
for table_index in range(table_size):
table = tables[table_index]
hash_function = tables_hash_function[table_index]
for window_index in range(data.shape[0]):
signature = (np.dot(data[window_index], hash_function) > 0).astype('int')
table[str(signature)].append(window_index)
tables_hash_function = [np.random.uniform(-1, 1, size=(window_size, hash_size)) for _ in range(table_size)]
print('Init time: ' + str(time() - t0))
try:
pool = Pool()
func = partial(fill_table, data, tables_hash_function)
print('Starting pool: ' + str(time() - t0))
tables = pool.map(func, range(table_size))
finally:
pool.close()
pool.join()
print('Creation time: ' + str(time() - t0))
hash_functions = np.array(tables_hash_function).tolist()
......@@ -83,8 +116,8 @@ def query():
output = {}
for t in tables.values():
signature = (np.dot(window, t["hash"]) > 0).astype('int')
neighbours.extend(t["entries"][str(signature)])
signature = ''.join((np.dot(window, t["hash"]) > 0).astype('int').astype('str'))
neighbours.extend(t["entries"][signature])
neighbours_with_frequency = dict(Counter(neighbours))
for index, frequency in neighbours_with_frequency.items():
if not frequency in output:
......@@ -93,6 +126,25 @@ def query():
response = jsonify(output)
return response
def create_valid_table(data, window_size, hash_size, correct_indices, incorrect_indices, index):
    """Draw random hash functions until one separates the labelled windows.

    A hash function is accepted when every window in ``correct_indices`` gets
    the same signature as the first of them and no window in
    ``incorrect_indices`` gets that signature. The accepted function is then
    used to bucket all windows. The loop has no attempt limit.

    Args:
        data: 2-D array with one window per row.
        window_size: Number of rows of the projection matrix.
        hash_size: Number of columns of the projection matrix (signature length).
        correct_indices: Row indices that must share one signature.
        incorrect_indices: Row indices that must not have that signature.
        index: Unused; present so the function can be mapped over a range.

    Returns:
        Dict with "hash" (the projection matrix as nested lists) and
        "entries" (``defaultdict(list)`` from signature to window indices).
    """
    def signature_of(window, hash_function):
        # One '0'/'1' character per projection column; '1' means strictly positive.
        return ''.join((np.dot(window, hash_function) > 0).astype('int').astype('str'))

    # Use a generator seeded from fresh OS entropy. Worker processes created
    # by forking inherit the parent's global NumPy random state, so drawing
    # from the global generator would give every worker the same "random"
    # hash functions.
    rng = np.random.default_rng()
    correct = list(correct_indices)
    incorrect = list(incorrect_indices)

    while True:
        hash_function = rng.standard_normal((window_size, hash_size))
        if not correct:
            # Without a correct window there is no reference signature to
            # test against, so the first draw is accepted.
            break
        reference = signature_of(data[correct[0]], hash_function)
        if (all(signature_of(data[i], hash_function) == reference for i in correct)
                and all(signature_of(data[i], hash_function) != reference for i in incorrect)):
            break

    entries = defaultdict(list)
    for window_index in range(data.shape[0]):
        entries[signature_of(data[window_index], hash_function)].append(window_index)
    return {
        "hash": hash_function.tolist(),
        "entries": entries
    }
@app.route('/update', methods=['POST'])
def update():
t0 = time()
......@@ -115,8 +167,8 @@ def update():
for t in tables.values():
valid = True
signature = (np.dot(window, t["hash"]) > 0).astype('int')
neighbours = t["entries"][str(signature)]
signature = ''.join((np.dot(window, t["hash"]) > 0).astype('int').astype('str'))
neighbours = t["entries"][signature]
for index in correct_indices:
if index not in neighbours:
valid = False
......@@ -128,21 +180,15 @@ def update():
if valid:
new_tables.append(t)
for i in range(table_size - len(new_tables)):
entries = defaultdict(list)
while True:
hash_function = np.random.randn(window_size, hash_size)
correct_signatures = [str((np.dot(data[index], hash_function) > 0).astype('int')) for index in correct_indices]
incorrect_signatures = [str((np.dot(data[index], hash_function) > 0).astype('int')) for index in incorrect_indices]
if correct_signatures.count(correct_signatures[0]) == len(correct_signatures) and incorrect_signatures.count(correct_signatures[0]) == 0:
break
for window_index in range(data.shape[0]):
signature = (np.dot(data[window_index], hash_function) > 0).astype('int')
entries[str(signature)].append(window_index)
new_tables.append({
"hash": hash_function.tolist(),
"entries": entries
})
try:
pool = Pool()
func = partial(create_valid_table, data, window_size, hash_size, correct_indices, incorrect_indices)
print('Starting pool: ' + str(time() - t0))
new_tables.extend(pool.map(func, range(table_size - len(new_tables))))
finally:
pool.close()
pool.join()
print('Update time: ' + str(time() - t0))
response = {}
for table_index in range(len(new_tables)):
......
This source diff could not be displayed because it is stored in LFS. You can view the blob instead.
Copyright (c) 2017-2020 Ingy döt Net
Copyright (c) 2006-2016 Kirill Simonov
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Metadata-Version: 2.1
Name: PyYAML
Version: 5.3.1
Summary: YAML parser and emitter for Python
Home-page: https://github.com/yaml/pyyaml
Author: Kirill Simonov
Author-email: xi@resolvent.net
License: MIT
Download-URL: https://pypi.org/project/PyYAML/
Platform: Any
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Cython
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 2
Classifier: Programming Language :: Python :: 2.7
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.5
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Text Processing :: Markup
YAML is a data serialization format designed for human readability
and interaction with scripting languages. PyYAML is a YAML parser
and emitter for Python.
PyYAML features a complete YAML 1.1 parser, Unicode support, pickle
support, a capable extension API, and sensible error messages. PyYAML
supports standard YAML tags and provides Python-specific tags that
allow representing an arbitrary Python object.
PyYAML is applicable for a broad range of tasks from complex
configuration files to object serialization and persistence.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment