Commit 69f8e465 authored by DylanKruyff

Initial commit: ABC, SAX, random vector notebooks + web-app prototype

.gitattributes
*.bigWig filter=lfs diff=lfs merge=lfs -text
*.csv filter=lfs diff=lfs merge=lfs -text
*.docx filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.mmap filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.pdf filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
*.xlsx filter=lfs diff=lfs merge=lfs -text
.gitignore
Documents/
\ No newline at end of file
.idea/ABC-Hashing.iml
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
</component>
</module>
\ No newline at end of file
.idea/misc.xml
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="JavaScriptSettings">
<option name="languageLevel" value="ES6" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
.idea/modules.xml
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/ABC-Hashing.iml" filepath="$PROJECT_DIR$/.idea/ABC-Hashing.iml" />
</modules>
</component>
</project>
\ No newline at end of file
.idea/workspace.xml
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ChangeListManager">
<list default="true" id="f3606558-684d-4ca9-ac80-49a9efbfd85f" name="Default Changelist" comment="" />
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="FileEditorManager">
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/run.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1428">
<caret line="92" column="15" selection-start-line="92" selection-start-column="15" selection-end-line="92" selection-end-column="15" />
<folding>
<element signature="e#82#92#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/file_io.py">
<provider selected="true" editor-type-id="text-editor" />
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/dataset.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="476">
<caret line="31" column="13" selection-start-line="31" selection-start-column="13" selection-end-line="31" selection-end-column="13" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/network_discovery.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="108">
<caret line="122" selection-start-line="122" selection-end-line="138" selection-end-column="38" />
<folding>
<element signature="e#66#79#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
</leaf>
</component>
<component name="FindInProjectRecents">
<findStrings>
<find>bit</find>
<find>bitlist_to_int</find>
</findStrings>
</component>
<component name="IdeDocumentHistory">
<option name="CHANGED_PATHS">
<list>
<option value="$PROJECT_DIR$/run.py" />
<option value="$PROJECT_DIR$/network_discovery.py" />
</list>
</option>
</component>
<component name="ProjectFrameBounds" extendedState="6">
<option name="x" value="-264" />
<option name="y" value="-36" />
<option name="width" value="1890" />
<option name="height" value="960" />
</component>
<component name="ProjectView">
<navigator proportions="" version="1">
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="Scope" />
<pane id="ProjectPane">
<subPane>
<expand>
<path>
<item name="ABC-Hashing" type="b2602c69:ProjectViewProjectNode" />
<item name="ABC-Hashing" type="462c0819:PsiDirectoryNode" />
</path>
</expand>
<select />
</subPane>
</pane>
</panes>
</component>
<component name="PropertiesComponent">
<property name="WebServerToolWindowFactoryState" value="false" />
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
<property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
<property name="nodejs_npm_path_reset_for_default_project" value="true" />
</component>
<component name="RunDashboard">
<option name="ruleStates">
<list>
<RuleState>
<option name="name" value="ConfigurationTypeDashboardGroupingRule" />
</RuleState>
<RuleState>
<option name="name" value="StatusDashboardGroupingRule" />
</RuleState>
</list>
</option>
</component>
<component name="SvnConfiguration">
<configuration />
</component>
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="f3606558-684d-4ca9-ac80-49a9efbfd85f" name="Default Changelist" comment="" />
<created>1592769549181</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1592769549181</updated>
<workItem from="1592769555026" duration="6121000" />
<workItem from="1592927512487" duration="1950000" />
</task>
<servers />
</component>
<component name="TimeTrackingManager">
<option name="totallyTimeSpent" value="8071000" />
</component>
<component name="ToolWindowManager">
<frame x="-7" y="-7" width="1295" height="695" extended-state="6" />
<editor active="true" />
<layout>
<window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.25383994" />
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
<window_info id="Favorites" order="2" side_tool="true" />
<window_info anchor="bottom" id="Message" order="0" />
<window_info anchor="bottom" id="Find" order="1" />
<window_info anchor="bottom" id="Run" order="2" />
<window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info anchor="bottom" id="TODO" order="6" />
<window_info anchor="bottom" id="Docker" order="7" show_stripe_button="false" />
<window_info anchor="bottom" id="Version Control" order="8" />
<window_info anchor="bottom" id="Database Changes" order="9" />
<window_info anchor="bottom" id="Event Log" order="10" side_tool="true" />
<window_info anchor="bottom" id="Terminal" order="11" />
<window_info anchor="bottom" id="Python Console" order="12" />
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
<window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
<window_info anchor="right" id="SciView" order="3" />
<window_info anchor="right" id="Database" order="4" />
</layout>
</component>
<component name="TypeScriptGeneratedFilesManager">
<option name="version" value="1" />
</component>
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/run.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1428">
<caret line="92" column="15" selection-start-line="92" selection-start-column="15" selection-end-line="92" selection-end-column="15" />
<folding>
<element signature="e#82#92#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/file_io.py">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/dataset.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="476">
<caret line="31" column="13" selection-start-line="31" selection-start-column="13" selection-end-line="31" selection-end-column="13" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/network_discovery.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="108">
<caret line="122" selection-start-line="122" selection-end-line="138" selection-end-column="38" />
<folding>
<element signature="e#66#79#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</component>
</project>
\ No newline at end of file
[Two more file diffs are collapsed; one additional file is stored in LFS and can only be viewed as a blob.]
"""
For keeping track of the different kinds of data in the project:
- Raw fMRI data
- Generated graph data
- Evaluation data (statistics, feature vectors)
"""
import os
import numpy as np
import networkx as nx
from file_io import *
class Dataset(object):
def __init__(self, subdir, data_file=None):
"""
Parameters
----------
subdir : str
Where the data live (one file, multiple files, files of various extensions, etc)
data_file : str
Optionally, if all data live in a single file in the subdir
"""
self.subdir = subdir
if self.subdir[-1] != '/':
self.subdir += '/'
self.data_file = data_file
def load_data_file(self, delimiter=None):
"""If data are in a single file, load it"""
if self.data_file is not None:
return np.loadtxt(self.subdir + self.data_file, delimiter=delimiter)
else:
            raise ValueError('No data file to load')
def gen_data(self, delimiter=None):
"""Generator of each data file in the specified directory"""
for fname in os.listdir(self.subdir):
yield np.loadtxt(self.subdir + fname, delimiter=delimiter)
@property
def filenames_with_paths(self):
"""For getting filenames with relative paths"""
return [self.subdir + f for f in sorted(os.listdir(self.subdir))]
class UCRDataset(Dataset):
def __init__(self, subdir, data_file):
super().__init__(subdir, data_file=data_file)
def read_data(self):
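        # UCR files are assumed to be comma-separated, with the class label in the first field of each line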
lines = get_lines_in_file(self.subdir + self.data_file)
lines = [list(map(float, line.split(','))) for line in lines]
labels = [line[0] for line in lines]
data = [line[1:] for line in lines]
return np.array(labels), np.array(data)
class SyntheticDataset(Dataset):
def __init__(self, subdir='data/', data_file='synth.txt'):
super().__init__(subdir, data_file)
def gen_data(self, N, num_samples, write_to_file=True):
"""Create some synthetic data, each sample of size N"""
A = np.random.rand(N, N)
cov = np.dot(A, A.T)
mean = np.zeros(N)
data = np.random.multivariate_normal(mean, cov, size=num_samples)
if write_to_file:
write_matrix_to_csv(self.subdir + self.data_file, data)
return data
class FMRIDataset(Dataset):
def __init__(self, subdir, data_file=None):
super().__init__(subdir, data_file=data_file)
    @staticmethod
    def subject_id_from_filename(f):
"""Subject id is before the file extension, separated by _"""
return f.split('_')[-1].split('.')[0]
class COBREDataset(FMRIDataset):
def __init__(self, subdir='data/cobre/',
data_file='Schiz_COBRE_1166_p50f0b_Danai.mat',
label_file='Schiz_COBRE_MDF_Danai.csv'):
"""All data are in a single mat file. Also have an associated label/annotation file"""
super().__init__(subdir, data_file)
self.label_file = label_file
@property
def labels(self):
"""Returns [<healthy ids>], [<unhealthy_ids>]"""
subject, status = 'Subject', 'Dx'
control, patient = 'Control', 'Patient'
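        # The '00' prefix zero-pads ids to match the subject-id format used in COBREFeatureDataset.X_y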
data = data_as_pd(self.subdir + self.label_file, [subject, status])
id_column, health_column = data[subject], data[status]
healthy = ['00' + str(id_column[i]) for i in range(len(data)) if health_column[i] == control]
unhealthy = ['00' + str(id_column[i]) for i in range(len(data)) if health_column[i] == patient]
return healthy, unhealthy
def gen_data(self):
data = data_from_mat(self.subdir + self.data_file)['data']
for i in range(data.shape[0]):
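            # roiTC is assumed to be stored ROI x time; .T yields a time x ROI matrix per subject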
yield '{0}'.format(data[i].Subject), data[i].roiTC.T
class PennDataset(FMRIDataset):
SCORE_COLUMNS = ['Complex Cognition', 'Memory', 'Social Cognition']
def __init__(self, subdir='data/penn/',
data_file=None, score_file='penn_scores.csv'):
super().__init__(subdir, data_file)
self.score_file = score_file
@property
def scores(self):
"""Returns dictionary of { subject_id : [complex cognition, memory, social cognition] } scores"""
        data = data_as_pd(self.subdir + self.score_file)
col1, col2, col3 = data[PennDataset.SCORE_COLUMNS[0]], \
data[PennDataset.SCORE_COLUMNS[1]], \
data[PennDataset.SCORE_COLUMNS[2]]
subject_id_column = data['subject_id']
subject_scores = {}
for i in range(len(subject_id_column)):
subject_id = str(int(subject_id_column[i]))
subject_scores[subject_id] = [ col1[i], col2[i], col3[i] ]
return subject_scores
def gen_data(self):
        files = [f for f in self.filenames_with_paths if f.endswith('.mat')]
for mat in files:
            yield FMRIDataset.subject_id_from_filename(mat), data_from_mat(mat)['roiTC'].T
class GraphDataset(Dataset):
def __init__(self, subdir):
super().__init__(subdir, data_file=None)
def gen_graphs(self, ext='', dict_format=False):
"""Returns a generator of subject_id, nx graph pairs"""
files = self.filenames_with_paths
for f in files:
            subject_id = FMRIDataset.subject_id_from_filename(f)
G = parse_edgelist(f, ext=ext) if dict_format else nx_from_edgelist(f, ext=ext)
yield subject_id, G
class COBREGraphDataset(GraphDataset):
"""Generated COBRE graphs"""
GRAPH_DIR = '/y/DATA/schiz_graphs/'
def __init__(self, subdir):
super().__init__(COBREGraphDataset.GRAPH_DIR + subdir)
class PennGraphDataset(GraphDataset):
"""Generated Penn graphs"""
GRAPH_DIR = '/y/DATA/penn_graphs/'
def __init__(self, subdir):
super().__init__(PennGraphDataset.GRAPH_DIR + subdir)
class FeatureDataset(Dataset):
def __init__(self, subdir, data_file):
super().__init__(subdir, data_file)
def _split_ids_and_features(self, id_column):
"""Returns the ID column as a separate DF from the features"""
df = data_as_pd(self.subdir + self.data_file)
subject_id_column = df[id_column]
        X = df.drop([id_column], axis=1).to_numpy()  # feature vectors; as_matrix() was removed in pandas 1.0
return subject_id_column, X
@property
def X_y(self):
"""Return feature vectors and label(s)"""
raise NotImplementedError
class COBREFeatureDataset(FeatureDataset):
def __init__(self, subdir, data_file):
super().__init__(subdir, data_file)
@property
def X_y(self):
"""Returns X (features), y (target).
Features are BINARY"""
subject_id_column, X = self._split_ids_and_features('subject_id')
y = np.zeros(len(subject_id_column))
healthy, unhealthy = COBREDataset().labels
healthy, unhealthy = set(healthy), set(unhealthy)
for i in range(len(subject_id_column)):
subject_id = '00{0}'.format(subject_id_column[i])
if subject_id in healthy:
y[i] = 1
elif subject_id in unhealthy:
y[i] = 0
return X, y
def main():
"""Testing code"""
pass
if __name__ == '__main__':
main()
"""
For file I/O on:
- numpy matrices
- pandas DataFrames
- mat files
- networkx graphs
- etc
"""
import csv
import os
import json
import numpy as np
import scipy.io as sio
import pandas as pd
import networkx as nx
from networkx.readwrite import json_graph
def get_lines_in_file(filename):
"""Return array of lines from file"""
with open(filename) as f:
lines = [line.rstrip('\n') for line in f.readlines()]
return lines
def data_as_np(filename, delimiter=',', skip_header=0):
"""Returns data as Numpy array"""
return np.genfromtxt(filename, delimiter=delimiter, skip_header=skip_header)
def data_as_pd(filename, keep_columns=None, drop_columns=None):
"""Return data as a pandas DataFrame"""
df = pd.read_csv(filename)
if keep_columns is not None:
df = df[keep_columns]
if drop_columns is not None:
df = df.drop(drop_columns, axis=1)
return df
def nx_from_edgelist(filename, delimiter=',', ext='.csv'):
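    # Reads lines of the form 'u,v,weight'; node labels are cast to int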
return nx.read_weighted_edgelist(filename + ext, delimiter=delimiter, nodetype=int)
def parse_edgelist(filename, ext=''):
    """Parses line-by-line edgelist"""
    lines = get_lines_in_file(filename + ext)
    return nx.parse_edgelist(lines, nodetype=int)
def data_from_mat(filename):
"""Get data from .mat file"""
return _load_mat(filename)
def _load_mat(filename):
"""
This function should be called instead of direct sio.loadmat
as it cures the problem of not properly recovering python dictionaries
from mat files. It calls the function check keys to cure all entries
which are still mat-objects.
Source for this function and the functions it calls:
http://stackoverflow.com/questions/7008608/scipy-io-loadmat-nested-structures-i-e-dictionaries
"""
data = sio.loadmat(filename, struct_as_record=False, squeeze_me=True)
return _check_keys(data)
def _check_keys(d):
"""
Checks if entries in dictionary are mat-objects. If yes,
todict is called to change them to nested dictionaries.
"""
for key in d:
if isinstance(d[key], sio.matlab.mio5_params.mat_struct):
d[key] = _todict(d[key])
return d
def _todict(matobj):
"""
A recursive function which constructs from matobjects nested dictionaries.
"""
d = {}
for strg in matobj._fieldnames:
elem = matobj.__dict__[strg]
if isinstance(elem, sio.matlab.mio5_params.mat_struct):
d[strg] = _todict(elem)
else:
d[strg] = elem
return d
def write_matrix_to_csv(filename, data):
"""Writes whole matrix to CSV"""
np.savetxt(filename, data, delimiter=',')
def write_row_to_csv(filename, row):
"""row is a list of items to write to the csv"""
with open(filename, 'a+') as f:
f.write(_list_to_csv_string(row))
def write_rows_to_csv(filename, rows, header=None):
"""Writes a list of lists (rows) to the CSV with an optional header"""
with open(filename, 'w') as f:
if header is not None:
f.write(_list_to_csv_string(header))
for row in rows:
f.write(_list_to_csv_string(row))
def write_column_to_csv(filename, data, header):
"""Adds a column to a CSV file"""
csv_input = pd.read_csv(filename)
csv_input[header] = data
csv_input.to_csv(filename, index=False)
def _list_to_csv_string(l, delimiter=','):
"""List to delimited string with newline"""
return delimiter.join(map(str, l)) + '\n'
def write_edgelist(filename, G, delimiter=',', ext='.csv'):
"""Writes the edge list to a CSV file"""
nx.write_weighted_edgelist(G, filename + ext, delimiter=delimiter)
def write_json_edgelist(filename, G, ext='.json'):
"""Converts nx graph to JSON and writes"""
json_data = json_graph.node_link_data(G)
s = json.dumps(json_data)
with open(filename + ext, 'w') as f:
f.write(s)
"""
Network discovery methods and discovered network analysis
"""
import random
import inspect
import math
import itertools
import numpy as np
import networkx as nx
from sklearn import preprocessing
from scipy import spatial, stats
from collections import defaultdict
def binarize(data):