Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ CanDI/misc.py
CanDI/setup/data/depmap
CanDI/setup/data/ctrp
CanDI/setup/data/coessentiality
CanDI/pipelines/coessentiality

*.swp
tests.py
Expand Down
2 changes: 1 addition & 1 deletion CanDI/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
version = "0.2.5"
version = "0.2.6"
3 changes: 1 addition & 2 deletions CanDI/setup/data/config.draft.ini
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[defaults]
sectionlist = ["download_urls", "defaults", "downloads", "formatted", "index", "data_paths", "autoload_info"]
downloads = ["depmap"]
depmap = ["sample_info", "gene_effect", "gene_dependency", "rnaseq_reads", "gene_cn", "mutations", "expression", "fusions"]
depmap = ["sample_info", "gene_effect", "gene_dependency", "rnaseq_reads", "gene_cn", "mutations", "expression", "fusions", "PRISM_fold_change_viability"]

[index]
gene_effect = gene
Expand All @@ -14,7 +14,6 @@ counts = gene

[autoload_info]
cell_lines = data/depmap/sample_info.csv
genes = data/genes/gene_info.csv
locations = data/locations/merged_locations.csv

[download_urls]
Expand Down
77,433 changes: 0 additions & 77,433 deletions CanDI/setup/data/genes/gene_info.csv

This file was deleted.

24 changes: 3 additions & 21 deletions CanDI/setup/dataverse.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,6 @@

### Datasets Metadata ###

coessentiality_dataset_names = [
'genes',
# 10273535
'GLS_p',
# 10273534
'GLS_sign',
# 10273533
]

depmap_dataset_names = [
'CCLE_expression',
'CCLE_fusions',
Expand All @@ -29,15 +20,12 @@
'CCLE_RNAseq_reads',
'CRISPR_gene_dependency',
'CRISPR_gene_effect',
'PRISM_fold_change_viability',
'sample_info',
'README',
]

name2type = {
# Coessentiality datasets
'genes': 'txt',
'GLS_p': 'npy',
'GLS_sign': 'npy',
# DepMap datasets
'CCLE_expression': 'csv',
'CCLE_fusions': 'csv',
Expand All @@ -46,6 +34,7 @@
'CCLE_RNAseq_reads': 'csv',
'CRISPR_gene_dependency': 'csv',
'CRISPR_gene_effect': 'csv',
'PRISM_fold_change_viability': 'h5ad.gz',
'sample_info': 'csv',
'README': 'txt',
}
Expand All @@ -63,6 +52,7 @@
'CCLE_RNAseq_reads': 8076859,
'CRISPR_gene_dependency': 8076863,
'CRISPR_gene_effect': 8076860,
'PRISM_fold_change_viability': 11854823,
'sample_info': 10085764,
'README': 8151459,
}
Expand Down Expand Up @@ -169,11 +159,3 @@ def __init__(self):

def download(self, path, return_type=None):
return self.run(path, depmap_dataset_names, return_type)


class CoessentialityDownloader(Downloader):
def __init__(self):
super().__init__()

def download(self, path, return_type=None):
return self.run(path, coessentiality_dataset_names, return_type)
20 changes: 10 additions & 10 deletions CanDI/setup/install.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def main():
if args.database == 'depmap':
if args.source == 'dataverse':
print("Downloading data from Dataverse")
m = manager.DataverseDepMap(manager_path=args.directory, verbose=True)
m = manager.DataverseBroadDepMap(manager_path=args.directory, verbose=True)
m.download_reformatted_data()
m.write_config(m.cfig_path, m.parser)

Expand All @@ -30,15 +30,15 @@ def main():
raise ValueError("Invalid source. Please specify either 'dataverse' or 'depmap'")

if args.database == 'coessentiality':
if args.source == 'dataverse':
print("Downloading data from Dataverse")
m = manager.DataverseCoessentiality(manager_path=args.directory, verbose=True)
m.download_raw_files()
m.coessentiality_autoformat()
m.write_config(m.cfig_path, m.parser)

else:
raise ValueError("Invalid source. Coessentiality data is only available on `dataverse`!")
raise ValueError("Coessentiality data is currently experimental and not supported in this version.")
# if args.source == 'dataverse':
# print("Downloading data from Dataverse")
# m = manager.DataverseCoessentiality(manager_path=args.directory, verbose=True)
# m.download_raw_files()
# m.coessentiality_autoformat()
# m.write_config(m.cfig_path, m.parser)
# else:
# raise ValueError("Invalid source. Coessentiality data is only available on `dataverse`!")


if __name__ == "__main__":
Expand Down
153 changes: 77 additions & 76 deletions CanDI/setup/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,11 @@ def write_config(cfig_path, parser):
f.close()


class DataverseDepMap(Manager):
class DataverseBroadDepMap(Manager):
def __init__(self, manager_path='auto', cfig_path='auto', verbose=False):
super().__init__(manager_path, cfig_path, verbose)
self.release = '21Q4' # default release uploaded to CanDI dataverse
self.depmap_release = '21Q4' # default release uploaded to CanDI dataverse
self.prism_release = '24Q2' # default release uploaded to CanDI dataverse
self.download_source = 'dataverse, ' + dataverse.CANDI_DATAVERSE_DOI

def download_reformatted_data(self):
Expand Down Expand Up @@ -118,7 +119,7 @@ def __init__(self, manager_path='auto', cfig_path='auto', verbose=False):
super().__init__(manager_path, cfig_path, verbose)
self.download_source = 'Broad DepMap, https://depmap.org/'

def get_depmap_info(self, release="latest"):
def get_depmap_info(self, depmap_release="latest"):

depmap = self.parser["download_urls"]["depmap"]
print("Getting download information from DepMap")
Expand All @@ -127,7 +128,7 @@ def get_depmap_info(self, release="latest"):
print("GET Successful")

self.response = response.json()
self.release = self.get_release(release)
self.depmap_release = self.get_release(depmap_release)
self.download_info, self.depmap_files = self.parse_release()
self.parser["depmap_urls"] = self.download_info
self.parser["depmap_files"] = self.depmap_files
Expand All @@ -138,7 +139,7 @@ def parse_release(self):
depmap_files = {}
for table in self.response["table"]:

if self.release == table["releaseName"] and table["downloadUrl"]:
if self.depmap_release == table["releaseName"] and table["downloadUrl"]:

download_urls[table["fileName"]] = table["downloadUrl"]
depmap_files[self.format_filename(table["fileName"])] = table["fileName"]
Expand All @@ -157,12 +158,12 @@ def get_release(self, release):

return release_info["releaseName"]

def format_filename(self, filename, release):
def format_filename(self, filename, depmap_release):

# set candi_name to the filename without the extension
candi_name = filename.split(".")[0]

if release == "21Q4":
if depmap_release == "21Q4":
if "CRISPR_" in candi_name:
candi_name = candi_name[len("CRISPR_"):]
elif "CCLE_" in candi_name:
Expand Down Expand Up @@ -258,9 +259,9 @@ def depmap_autoformat(self):
df = pd.read_csv(v, low_memory=False, memory_map=True)
self.format_depmap_data(df, v)

def format_depmap_data(self, df, path, release):
def format_depmap_data(self, df, path, depmap_release):

if release == "21Q4":
if depmap_release == "21Q4":
if ("AAAS (8086)" in df.columns) or ("AAAS (ENSG00000094914)" in df.columns):

df.rename(columns = lambda s: s.split(" ")[0], inplace=True)
Expand Down Expand Up @@ -318,87 +319,87 @@ def sanger_download():
pass


class DataverseCoessentiality(Manager):
def __init__(self, manager_path='auto', cfig_path='auto', verbose=False):
super().__init__(manager_path, cfig_path, verbose)
self.download_source = 'Dataverse'
self.reference = 'https://github.com/kundajelab/coessentiality'
self.verbose = verbose
# class DataverseCoessentiality(Manager):
# def __init__(self, manager_path='auto', cfig_path='auto', verbose=False):
# super().__init__(manager_path, cfig_path, verbose)
# self.download_source = 'Dataverse'
# self.reference = 'https://github.com/kundajelab/coessentiality'
# self.verbose = verbose

def download_raw_files(self):
if not os.path.exists(self.manager_path + '/data/'):
os.makedirs(self.manager_path + '/data/')
# def download_raw_files(self):
# if not os.path.exists(self.manager_path + '/data/'):
# os.makedirs(self.manager_path + '/data/')

if not os.path.exists(self.manager_path + '/data/coessentiality/'):
os.makedirs(self.manager_path + '/data/coessentiality/')
# if not os.path.exists(self.manager_path + '/data/coessentiality/'):
# os.makedirs(self.manager_path + '/data/coessentiality/')

session = dataverse.CoessentialityDownloader()
urls, file_names = session.download(
self.manager_path + '/data/coessentiality/',
return_type= ["url", "name"]
)
# session = dataverse.CoessentialityDownloader()
# urls, file_names = session.download(
# self.manager_path + '/data/coessentiality/',
# return_type= ["url", "name"]
# )

self.urls = urls
self.file_names = file_names
# self.urls = urls
# self.file_names = file_names

def _load_coessentiality_matrix(self):
data_dir = f'{self.manager_path}/data/coessentiality'
# def _load_coessentiality_matrix(self):
# data_dir = f'{self.manager_path}/data/coessentiality'

gene_names = pd.read_csv(
f'{data_dir}/genes.txt',header=None,names=['gene_name']
)['gene_name']
# gene_names = pd.read_csv(
# f'{data_dir}/genes.txt',header=None,names=['gene_name']
# )['gene_name']

GLS_sign = np.load(f'{data_dir}/GLS_sign.npy')
GLS_p = np.load(f'{data_dir}/GLS_p.npy')
# GLS_sign = np.load(f'{data_dir}/GLS_sign.npy')
# GLS_p = np.load(f'{data_dir}/GLS_p.npy')

self.matrix = pl.from_dataframe(
pd.DataFrame((-1*np.log10(GLS_p)) * GLS_sign, columns = gene_names, index = gene_names).reset_index()
)

def _get_coessentiality_df(self, pvalue_threshold = 10**-3):
df = self.matrix.melt('gene_name')
df.columns = ['gene_1','gene_2','coessentiality']
df = df.filter(~(pl.col('gene_1') == pl.col('gene_2')))
df = df.filter(pl.col('coessentiality') > -np.log10(pvalue_threshold))
# self.matrix = pl.from_dataframe(
# pd.DataFrame((-1*np.log10(GLS_p)) * GLS_sign, columns = gene_names, index = gene_names).reset_index()
# )

# def _get_coessentiality_df(self, pvalue_threshold = 10**-3):
# df = self.matrix.melt('gene_name')
# df.columns = ['gene_1','gene_2','coessentiality']
# df = df.filter(~(pl.col('gene_1') == pl.col('gene_2')))
# df = df.filter(pl.col('coessentiality') > -np.log10(pvalue_threshold))

self.df = df
self.pvalue_threshold = pvalue_threshold
# self.df = df
# self.pvalue_threshold = pvalue_threshold

def coessentiality_autoformat(self):
# def coessentiality_autoformat(self):

coessentiality_matrix_path = f'{self.manager_path}/data/coessentiality/coessentiality_matrix.csv'
coessentiality_df_path = f'{self.manager_path}/data/coessentiality/coessentiality_df.csv'
# coessentiality_matrix_path = f'{self.manager_path}/data/coessentiality/coessentiality_matrix.csv'
# coessentiality_df_path = f'{self.manager_path}/data/coessentiality/coessentiality_df.csv'

# Check if the data has already been formatted or run the formatting
if os.path.exists(coessentiality_matrix_path):
if self.verbose: print("coessentiality_matrix.csv already exists")
# # Check if the data has already been formatted or run the formatting
# if os.path.exists(coessentiality_matrix_path):
# if self.verbose: print("coessentiality_matrix.csv already exists")

else:
if self.verbose: print("Building Coessentiality Matrix ...", end=' ')
self._load_coessentiality_matrix()
self.matrix.to_pandas().to_csv(coessentiality_matrix_path)
if self.verbose: print("Done!")
# else:
# if self.verbose: print("Building Coessentiality Matrix ...", end=' ')
# self._load_coessentiality_matrix()
# self.matrix.to_pandas().to_csv(coessentiality_matrix_path)
# if self.verbose: print("Done!")

if os.path.exists(coessentiality_df_path):
if self.verbose: print("coessentiality_df.csv already exists")
# if os.path.exists(coessentiality_df_path):
# if self.verbose: print("coessentiality_df.csv already exists")

else:
if self.verbose: print("Building Coessentiality DataFrame ...", end=' ')
self._get_coessentiality_df()
self.df.to_pandas().to_csv(coessentiality_df_path)
if self.verbose: print("Done!")
# else:
# if self.verbose: print("Building Coessentiality DataFrame ...", end=' ')
# self._get_coessentiality_df()
# self.df.to_pandas().to_csv(coessentiality_df_path)
# if self.verbose: print("Done!")

# Update the config file
self.parser['data_paths'].update({
'coessentiality': 'data/coessentiality/'
})

self.parser['formatted'].update({
'coessentiality_matrix.csv': coessentiality_matrix_path,
'coessentiality_df.csv': coessentiality_df_path
})
# # Update the config file
# self.parser['data_paths'].update({
# 'coessentiality': 'data/coessentiality/'
# })

# self.parser['formatted'].update({
# 'coessentiality_matrix.csv': coessentiality_matrix_path,
# 'coessentiality_df.csv': coessentiality_df_path
# })

self.parser['depmap_files'].update({
'coessentiality': coessentiality_df_path,
'coessentiality_matrix': coessentiality_matrix_path,
})
# self.parser['depmap_files'].update({
# 'coessentiality': coessentiality_df_path,
# 'coessentiality_matrix': coessentiality_matrix_path,
# })
18 changes: 8 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ pip install PyCanDI && candi-install

Downloaded and formatted datasets are organized this way:

```
``` text
.
├── config.ini # modified after Installation
├── depmap
Expand All @@ -32,8 +32,6 @@ Downloaded and formatted datasets would organize this way:
│ ├── CRISPR_gene_dependency.csv
│ ├── CRISPR_gene_effect.csv
│ └── sample_info.csv
├── genes
│ └── gene_info.csv
└── locations
└── merged_locations.csv
```
Expand All @@ -55,18 +53,18 @@ from CanDI import candi

### CanDI Objects

- `data` : Container for all candi datasets. All access to datasets goes
- `data` : Container for all candi datasets. All access to datasets goes
through the `data` object.
- `Gene` : Provides cross dataset indexing from the gene perspective.
- `CellLine` : Provides cross dataset indexing from the cell line
- `Gene` : Provides cross dataset indexing from the gene perspective.
- `CellLine` : Provides cross dataset indexing from the cell line
perspective.
- `Cancer` : Provides cross dataset indexing by a group of cell lines
- `Cancer` : Provides cross dataset indexing by a group of cell lines
that are all from the same tissue.
- `Organelle`: Provides cross dataset indexing for a group of genes
- `Organelle`: Provides cross dataset indexing for a group of genes
whose proteins localize to the same organelle.
- `CellLineCluster` : Provides cross dataset indexing for a group of
- `CellLineCluster` : Provides cross dataset indexing for a group of
user defined cell lines.
- `GeneCluster` : Provides cross dataset indexing for a group of user
- `GeneCluster` : Provides cross dataset indexing for a group of user
defined genes.

### Demos
Expand Down
Loading