Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Acquires and loads gene and mutation data #42

Merged
merged 6 commits into from
Dec 11, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions api/management/commands/acquiredata.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,13 @@ def handle(self, *args, **options):
if not os.path.exists(sample_path):
sample_url = 'https://raw.githubusercontent.com/cognoma/cancer-data/master/data/samples.tsv'
urlretrieve(sample_url, sample_path)

gene_path = os.path.join(options['path'], 'genes.tsv')
if not os.path.exists(gene_path):
gene_url = 'https://raw.githubusercontent.com/cognoma/genes/master/data/genes.tsv'
urlretrieve(gene_url, gene_path)

mutation_path = os.path.join(options['path'], 'mutation-matrix.tsv.bz2')
if not os.path.exists(mutation_path):
mutation_url = 'https://ndownloader.figshare.com/files/5864862'
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be nice if we could use the figshare download logic from cognoml. See cognoml/figshare.py. @jessept, is our code modular enough that @stephenshank can use cognoml to download the figshare data here, or would this application be out of scope?

I created a corresponding issue for the cognoml team: cognoma/cognoml#15.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just assigned cognoma/cognoml#15 to myself to help move this forward. We can definitely use our data retrieval code in other places, @stephenshank check if the code here works for what you need. I'm happy to add any additional helper code as well, just let me know what you're looking for.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jessept -- nice. My main worry here is that hardcoding the URL for a specific dataset of a specific version of the figshare data is going to cause an upkeep issue later on. So the goal will be to use the cognoml logic for figshare downloads to avoid repeating any efforts and clean this up!

urlretrieve(mutation_url, mutation_path)
51 changes: 48 additions & 3 deletions api/management/commands/loaddata.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import os
import csv
import bz2

from django.core.management.base import BaseCommand

from api.models import Disease, Sample
from api.models import Disease, Sample, Gene, Mutation


class Command(BaseCommand):
Expand All @@ -22,22 +23,66 @@ def handle(self, *args, **options):
disease_path = os.path.join(options['path'], 'diseases.tsv')
with open(disease_path) as disease_file:
disease_reader = csv.DictReader(disease_file, delimiter='\t')
disease_list = []
for row in disease_reader:
Disease.objects.create(
disease = Disease(
acronym=row['acronym'],
name=row['disease']
)
disease_list.append(disease)
Disease.objects.bulk_create(disease_list)

# Samples
if Sample.objects.count() == 0:
sample_path = os.path.join(options['path'], 'samples.tsv')
with open(sample_path) as sample_file:
sample_reader = csv.DictReader(sample_file, delimiter='\t')
sample_list = []
for row in sample_reader:
disease = Disease.objects.get(acronym=row['acronym'])
Sample.objects.create(
sample = Sample(
sample_id=row['sample_id'],
disease=disease,
gender=row['gender'] or None,
age_diagnosed=row['age_diagnosed'] or None
)
sample_list.append(sample)
Sample.objects.bulk_create(sample_list)

# Genes
# Only populate an empty table, so re-running the command does not try to
# insert the same rows a second time.
if Gene.objects.count() == 0:
    gene_path = os.path.join(options['path'], 'genes.tsv')
    with open(gene_path) as gene_file:
        gene_reader = csv.DictReader(gene_file, delimiter='\t')
        gene_list = []
        for row in gene_reader:
            # synonyms/aliases are pipe-delimited strings. ''.split('|')
            # returns [''], which is truthy, so `split(...) or None` can
            # never produce None. Test the raw string instead so genes
            # without synonyms/aliases are stored as NULL rather than [''].
            synonyms = row['synonyms']
            aliases = row['aliases']
            gene = Gene(
                entrez_gene_id=row['entrez_gene_id'],
                symbol=row['symbol'],
                description=row['description'],
                chromosome=row['chromosome'] or None,
                gene_type=row['gene_type'],
                synonyms=synonyms.split('|') if synonyms else None,
                aliases=aliases.split('|') if aliases else None
            )
            gene_list.append(gene)
        # Single bulk INSERT instead of one query per gene.
        Gene.objects.bulk_create(gene_list)

# Mutations
if Mutation.objects.count() == 0:
mutation_path = os.path.join(options['path'], 'mutation-matrix.tsv.bz2')
with bz2.open(mutation_path , 'rt') as mutation_file:
mutation_reader = csv.DictReader(mutation_file, delimiter='\t')
mutation_list = []
for row in mutation_reader:
sample_id = row.pop('sample_id')
sample = Sample.objects.get(sample_id=sample_id)
for entrez_gene_id, mutation_status in row.items():
if mutation_status == '1':
try:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This does indeed catch a lone exception, even with the up-to-date genes.tsv file. The Entrez ID is 117153. Putting this into NCBI's Gene database shows that it is an out of date ID, and related to melanoma. The current ID is 4253, which is also found in mutation-matrix.tsv.bz2.

Copy link
Member

@dhimmel dhimmel Dec 8, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you using the latest figshare files (v5)?

From https://api.figshare.com/v2/articles/3487685:

[
   {
      "is_link_only":false,
      "size":173889264,
      "id":5864859,
      "download_url":"https://ndownloader.figshare.com/files/5864859",
      "name":"expression-matrix.tsv.bz2"
   },
   {
      "is_link_only":false,
      "size":1564703,
      "id":5864862,
      "download_url":"https://ndownloader.figshare.com/files/5864862",
      "name":"mutation-matrix.tsv.bz2"
   },
   {
      "is_link_only":false,
      "size":772313,
      "id":6207135,
      "download_url":"https://ndownloader.figshare.com/files/6207135",
      "name":"samples.tsv"
   },
   {
      "is_link_only":false,
      "size":1211305,
      "id":6207138,
      "download_url":"https://ndownloader.figshare.com/files/6207138",
      "name":"covariates.tsv"
   }
]

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like you are. Will keep investigating.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you delete your local mutation-matrix.tsv.bz2? Maybe it's outdated, but since it already exists it is not being re-downloaded... that's all I can think of.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you delete your local mutation-matrix.tsv.bz2...

Just tried this, and the issue persists. ID 117153 (the lone exception) was discontinued on 9/10/16 and replaced with 4253. To investigate, I downloaded mutation-matrix.tsv.bz2 from the url in your JSON above. After loading with df=pandas.read_table(path, index_col=0), when I run '4253' in df.columns and '117153' in df.columns, both return True.

Looking at commit histories in cancer-data, the mutation matrix appears to have been made before this date. Looking at commits from genes, Entrez information was obtained after this date. My guess is that this ID changed between the creation of these two files.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, mutation-matrix.tsv.bz2 should filter out all genes that are not in cognoma/genes. I'll look into this, but for now keeping the error handling makes sense.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the event that this is useful, when I download the latest genes file, load with pandas, and run 4253 in df.entrez_gene_id I get True, but when I run 117153 in df.entrez_gene_id, I get False. I feel as though this further supports the idea that the discrepancy between these two files is due to the dates on which they were made, given that some information from Entrez changed in between.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for looking into this @stephenshank. I reported the issue in cognoma/cancer-data#36.

gene = Gene.objects.get(entrez_gene_id=entrez_gene_id)
mutation = Mutation(gene=gene, sample=sample)
mutation_list.append(mutation)
except:
print('Had an issue inserting sample', sample_id, 'mutation', entrez_gene_id)
Mutation.objects.bulk_create(mutation_list)
46 changes: 46 additions & 0 deletions api/migrations/0003_genes_mutations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.9.8 on 2016-11-26 02:22
from __future__ import unicode_literals

import django.contrib.postgres.fields
from django.db import migrations, models
import django.db.models.deletion


# Schema migration for the `api` app: creates the Gene model, drops
# Mutation.status, and alters the Classifier/Mutation gene relations so they
# target api.Gene.
class Migration(migrations.Migration):

# Must run after the previous `api` migration.
dependencies = [
('api', '0002_alter_sample_fields'),
]

operations = [
# New Gene table; entrez_gene_id is the primary key.
migrations.CreateModel(
name='Gene',
fields=[
('entrez_gene_id', models.IntegerField(primary_key=True, serialize=False)),
('symbol', models.CharField(max_length=32)),
('description', models.CharField(max_length=256)),
('chromosome', models.CharField(max_length=8, null=True)),
('gene_type', models.CharField(max_length=16)),
('synonyms', django.contrib.postgres.fields.ArrayField(base_field=models.CharField(max_length=32), null=True, size=None)),
('aliases', django.contrib.postgres.fields.ArrayField(base_field=models.CharField(max_length=256), null=True, size=None)),
],
options={
'db_table': 'cognoma_genes',
},
),
# Mutation no longer carries a `status` column.
migrations.RemoveField(
model_name='mutation',
name='status',
),
# Classifier.genes becomes a many-to-many relation to api.Gene.
migrations.AlterField(
model_name='classifier',
name='genes',
field=models.ManyToManyField(to='api.Gene'),
),
# Mutation.gene becomes a cascading foreign key to api.Gene, reachable
# from a gene via the `mutations` reverse accessor.
migrations.AlterField(
model_name='mutation',
name='gene',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='mutations', to='api.Gene'),
),
]
14 changes: 12 additions & 2 deletions api/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from django.db import models
from django.contrib.postgres import fields as postgresfields

from genes.models import Gene

GENDER_CHOICES = (
("male", "Male"),
Expand Down Expand Up @@ -37,14 +36,25 @@ class Meta:
gender = models.CharField(choices=GENDER_CHOICES, max_length=6, null=True)
age_diagnosed = models.IntegerField(null=True, blank=False)

# Gene record keyed by its Entrez gene id, stored in the "cognoma_genes" table.
class Gene(models.Model):
class Meta:
db_table = "cognoma_genes"

# Integer primary key, declared explicitly via primary_key=True.
entrez_gene_id = models.IntegerField(primary_key=True)
symbol = models.CharField(max_length=32)
description = models.CharField(max_length=256)
# Nullable: a gene may have no chromosome recorded.
chromosome = models.CharField(max_length=8, null=True)
gene_type = models.CharField(max_length=16)
# Postgres array columns of strings (elements capped at 32 and 256
# characters respectively); either column may be NULL.
synonyms = postgresfields.ArrayField(models.CharField(max_length=32), null=True)
aliases = postgresfields.ArrayField(models.CharField(max_length=256), null=True)

class Mutation(models.Model):
class Meta:
db_table = "mutations"

# id added by default
gene = models.ForeignKey(Gene, related_name='mutations')
sample = models.ForeignKey(Sample, related_name='mutations')
status = models.BooleanField()

class Classifier(models.Model):
class Meta:
Expand Down
27 changes: 8 additions & 19 deletions api/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from expander import ExpanderSerializerMixin
from drf_dynamic_fields import DynamicFieldsMixin

from api.models import User, Classifier, Disease, Sample, Mutation
from genes.models import Gene, Organism
from api.models import User, Classifier, Disease, Sample, Mutation, Gene


class UserSerializer(DynamicFieldsMixin, serializers.Serializer):
id = serializers.IntegerField(read_only=True)
Expand Down Expand Up @@ -50,34 +50,24 @@ def to_representation(self, obj):

return output

class OrganismSerializer(DynamicFieldsMixin, serializers.Serializer):
id = serializers.IntegerField()
taxonomy_id = serializers.IntegerField()
common_name = serializers.CharField()
scientific_name = serializers.CharField()
slug = serializers.CharField()

class MutationSerializer(DynamicFieldsMixin, ExpanderSerializerMixin, serializers.Serializer):
id = serializers.IntegerField()
gene = serializers.PrimaryKeyRelatedField(queryset=Gene.objects.all())
sample = serializers.PrimaryKeyRelatedField(queryset=Sample.objects.all())
status = serializers.BooleanField()

class GeneSerializer(DynamicFieldsMixin, ExpanderSerializerMixin, serializers.Serializer):
class Meta:
expandable_fields = {
'organism': OrganismSerializer,
'mutations': (MutationSerializer, (), {'many': True})
}

id = serializers.IntegerField()
entrezid = serializers.IntegerField()
systematic_name = serializers.CharField()
standard_name = serializers.CharField()
entrez_gene_id = serializers.IntegerField()
symbol = serializers.CharField()
description = serializers.CharField()
organism = OrganismSerializer()
aliases = serializers.CharField()
obsolete = serializers.BooleanField()
chromosome = serializers.CharField()
gene_type = serializers.CharField()
synonyms = serializers.ListField(child=serializers.CharField(allow_blank=True))
aliases = serializers.ListField(child=serializers.CharField(allow_blank=True))
mutations = serializers.PrimaryKeyRelatedField(many=True, queryset=Mutation.objects.all())

class MutationSerializerMeta:
Expand Down Expand Up @@ -156,4 +146,3 @@ class Meta:
mutations = serializers.PrimaryKeyRelatedField(many=True, queryset=Mutation.objects.all())
gender = serializers.CharField()
age_diagnosed = serializers.IntegerField()

39 changes: 17 additions & 22 deletions api/test/test_classifiers.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from rest_framework.test import APITestCase, APIClient

from api.models import Disease
from genes.models import Gene, Organism
from api.models import Disease, Gene


class ClassifierTests(APITestCase):
classifier_keys = ['id',
Expand All @@ -22,30 +22,26 @@ def setUp(self):

self.token = 'Bearer ' + self.user['random_slugs'][0]

self.human = Organism.objects.create(taxonomy_id=123,
common_name='human',
scientific_name='homo sapien',
slug='homo-sapien')
self.gene1 = Gene.objects.create(entrezid=123456,
systematic_name='foo',
description='bar',
aliases='foo, bar',
obsolete=False,
weight=1.0,
organism_id=self.human.id)
self.gene2 = Gene.objects.create(entrezid=234567,
systematic_name='foo',
description='bar',
aliases='foo, bar',
obsolete=False,
weight=1.0,
organism_id=self.human.id)
self.gene1 = Gene.objects.create(entrez_gene_id=123456,
symbol='GENE123',
description='foo',
chromosome='1',
gene_type='bar',
synonyms=['foo', 'bar'],
aliases=['foo', 'bar'])
self.gene2 = Gene.objects.create(entrez_gene_id=234567,
symbol='GENE234',
description='foo',
chromosome='X',
gene_type='bar',
synonyms=['foo', 'bar'],
aliases=['foo', 'bar'])
self.disease1 = Disease.objects.create(acronym='BLCA',
name='bladder urothelial carcinoma')
self.disease2 = Disease.objects.create(acronym='GBM',
name='glioblastoma multiforme')
self.classifier_post_data = {
'genes': [self.gene1.id, self.gene2.id],
'genes': [self.gene1.entrez_gene_id, self.gene2.entrez_gene_id],
'diseases': [self.disease1.acronym, self.disease2.acronym]
}

Expand Down Expand Up @@ -228,4 +224,3 @@ def test_expansion(self):
self.assertTrue(isinstance(list_response.data['results'][1]['genes'][1], dict))
self.assertTrue(isinstance(list_response.data['results'][1]['diseases'][0], dict))
self.assertTrue(isinstance(list_response.data['results'][1]['diseases'][1], dict))

52 changes: 23 additions & 29 deletions api/test/test_genes.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,32 @@
from rest_framework.test import APITestCase, APIClient

from genes.models import Gene, Organism
from api.models import Gene

class GeneTests(APITestCase):
gene_keys = ['id',
'entrezid',
'systematic_name',
'standard_name',
gene_keys = ['entrez_gene_id',
'symbol',
'description',
'organism',
'chromosome',
'gene_type',
'synonyms',
'aliases',
'obsolete',
'mutations']

def setUp(self):
self.human = Organism.objects.create(taxonomy_id=123,
common_name='human',
scientific_name='homo sapien',
slug='homo-sapien')
self.gene1 = Gene.objects.create(entrezid=123456,
systematic_name='foo',
description='bar',
aliases='foo, bar',
obsolete=False,
weight=1.0,
organism_id=self.human.id)
self.gene2 = Gene.objects.create(entrezid=234567,
systematic_name='foo',
description='bar',
aliases='foo, bar',
obsolete=False,
weight=1.0,
organism_id=self.human.id)
self.gene1 = Gene.objects.create(entrez_gene_id=123456,
symbol='GENE123',
description='foo',
chromosome='1',
gene_type='bar',
synonyms=['foo', 'bar'],
aliases=['foo', 'bar'])
self.gene2 = Gene.objects.create(entrez_gene_id=234567,
symbol='GENE234',
description='foo',
chromosome='X',
gene_type='bar',
synonyms=['foo', 'bar'],
aliases=['foo', 'bar'])

def test_list_genes(self):
client = APIClient()
Expand All @@ -50,15 +45,15 @@ def test_list_genes(self):
def test_get_gene(self):
client = APIClient()

get_response = client.get('/genes/' + str(self.gene1.id))
get_response = client.get('/genes/' + str(self.gene1.entrez_gene_id))

self.assertEqual(get_response.status_code, 200)
self.assertEqual(list(get_response.data.keys()), self.gene_keys)

def test_entrezid_filter(self):
client = APIClient()

list_response = client.get('/genes?entrezid=123456')
list_response = client.get('/genes?entrez_gene_id=123456')

self.assertEqual(list_response.status_code, 200)
self.assertEqual(list(list_response.data.keys()), ['count',
Expand All @@ -67,5 +62,4 @@ def test_entrezid_filter(self):
'results'])
self.assertEqual(len(list_response.data['results']), 1)
self.assertEqual(list(list_response.data['results'][0].keys()), self.gene_keys)
self.assertEqual(list_response.data['results'][0]['entrezid'], 123456)

self.assertEqual(list_response.data['results'][0]['entrez_gene_id'], 123456)
Loading