diff --git a/api/management/commands/acquiredata.py b/api/management/commands/acquiredata.py
index 43e1543..d06ccb9 100644
--- a/api/management/commands/acquiredata.py
+++ b/api/management/commands/acquiredata.py
@@ -27,3 +27,13 @@ def handle(self, *args, **options):
         if not os.path.exists(sample_path):
             sample_url = 'https://raw.githubusercontent.com/cognoma/cancer-data/master/data/samples.tsv'
             urlretrieve(sample_url, sample_path)
+
+        gene_path = os.path.join(options['path'], 'genes.tsv')
+        if not os.path.exists(gene_path):
+            gene_url = 'https://raw.githubusercontent.com/cognoma/genes/master/data/genes.tsv'
+            urlretrieve(gene_url, gene_path)
+
+        mutation_path = os.path.join(options['path'], 'mutation-matrix.tsv.bz2')
+        if not os.path.exists(mutation_path):
+            mutation_url = 'https://ndownloader.figshare.com/files/5864862'
+            urlretrieve(mutation_url, mutation_path)
diff --git a/api/management/commands/loaddata.py b/api/management/commands/loaddata.py
index 57bf9ee..06c22ad 100644
--- a/api/management/commands/loaddata.py
+++ b/api/management/commands/loaddata.py
@@ -1,9 +1,10 @@
 import os
 import csv
+import bz2
 
 from django.core.management.base import BaseCommand
 
-from api.models import Disease, Sample
+from api.models import Disease, Sample, Gene, Mutation
 
 
 class Command(BaseCommand):
@@ -22,22 +23,71 @@ def handle(self, *args, **options):
             disease_path = os.path.join(options['path'], 'diseases.tsv')
             with open(disease_path) as disease_file:
                 disease_reader = csv.DictReader(disease_file, delimiter='\t')
+                disease_list = []
                 for row in disease_reader:
-                    Disease.objects.create(
+                    disease = Disease(
                         acronym=row['acronym'],
                         name=row['disease']
                     )
+                    disease_list.append(disease)
+                Disease.objects.bulk_create(disease_list)
 
         # Samples
         if Sample.objects.count() == 0:
             sample_path = os.path.join(options['path'], 'samples.tsv')
             with open(sample_path) as sample_file:
                 sample_reader = csv.DictReader(sample_file, delimiter='\t')
+                sample_list = []
                 for row in sample_reader:
                     disease = Disease.objects.get(acronym=row['acronym'])
-                    Sample.objects.create(
+                    sample = Sample(
                         sample_id=row['sample_id'],
                         disease=disease,
                         gender=row['gender'] or None,
                         age_diagnosed=row['age_diagnosed'] or None
                     )
+                    sample_list.append(sample)
+                Sample.objects.bulk_create(sample_list)
+
+        # Genes
+        if Gene.objects.count() == 0:
+            gene_path = os.path.join(options['path'], 'genes.tsv')
+            with open(gene_path) as gene_file:
+                gene_reader = csv.DictReader(gene_file, delimiter='\t')
+                gene_list = []
+                for row in gene_reader:
+                    # ''.split('|') returns [''], which is truthy, so
+                    # `split('|') or None` can never produce None. Test the
+                    # raw field so that empty columns are stored as NULL.
+                    gene = Gene(
+                        entrez_gene_id=row['entrez_gene_id'],
+                        symbol=row['symbol'],
+                        description=row['description'],
+                        chromosome=row['chromosome'] or None,
+                        gene_type=row['gene_type'],
+                        synonyms=row['synonyms'].split('|') if row['synonyms'] else None,
+                        aliases=row['aliases'].split('|') if row['aliases'] else None
+                    )
+                    gene_list.append(gene)
+                Gene.objects.bulk_create(gene_list)
+
+        # Mutations
+        if Mutation.objects.count() == 0:
+            mutation_path = os.path.join(options['path'], 'mutation-matrix.tsv.bz2')
+            with bz2.open(mutation_path, 'rt') as mutation_file:
+                mutation_reader = csv.DictReader(mutation_file, delimiter='\t')
+                mutation_list = []
+                for row in mutation_reader:
+                    # After sample_id is popped, each remaining column key is used as an Entrez gene id.
+                    sample_id = row.pop('sample_id')
+                    sample = Sample.objects.get(sample_id=sample_id)
+                    for entrez_gene_id, mutation_status in row.items():
+                        if mutation_status == '1':
+                            try:
+                                gene = Gene.objects.get(entrez_gene_id=entrez_gene_id)
+                            except (Gene.DoesNotExist, ValueError):
+                                # No matching Gene row (or a non-integer column key): report it and skip this cell.
+                                print('Had an issue inserting sample', sample_id, 'mutation', entrez_gene_id)
+                            else:
+                                mutation_list.append(Mutation(gene=gene, sample=sample))
+                Mutation.objects.bulk_create(mutation_list)
diff --git a/api/migrations/0003_genes_mutations.py b/api/migrations/0003_genes_mutations.py
new file mode 100644
index 0000000..1103b66
--- /dev/null
+++ b/api/migrations/0003_genes_mutations.py
@@ -0,0 +1,46 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.9.8 on 2016-11-26 02:22
+from __future__ import unicode_literals
+
+import django.contrib.postgres.fields
+from django.db
import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('api', '0002_alter_sample_fields'), + ] + + operations = [ + migrations.CreateModel( + name='Gene', + fields=[ + ('entrez_gene_id', models.IntegerField(primary_key=True, serialize=False)), + ('symbol', models.CharField(max_length=32)), + ('description', models.CharField(max_length=256)), + ('chromosome', models.CharField(max_length=8, null=True)), + ('gene_type', models.CharField(max_length=16)), + ('synonyms', django.contrib.postgres.fields.ArrayField(base_field=models.CharField(max_length=32), null=True, size=None)), + ('aliases', django.contrib.postgres.fields.ArrayField(base_field=models.CharField(max_length=256), null=True, size=None)), + ], + options={ + 'db_table': 'cognoma_genes', + }, + ), + migrations.RemoveField( + model_name='mutation', + name='status', + ), + migrations.AlterField( + model_name='classifier', + name='genes', + field=models.ManyToManyField(to='api.Gene'), + ), + migrations.AlterField( + model_name='mutation', + name='gene', + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='mutations', to='api.Gene'), + ), + ] diff --git a/api/models.py b/api/models.py index f9e24bf..af25a00 100644 --- a/api/models.py +++ b/api/models.py @@ -3,7 +3,6 @@ from django.db import models from django.contrib.postgres import fields as postgresfields -from genes.models import Gene GENDER_CHOICES = ( ("male", "Male"), @@ -37,6 +36,18 @@ class Meta: gender = models.CharField(choices=GENDER_CHOICES, max_length=6, null=True) age_diagnosed = models.IntegerField(null=True, blank=False) +class Gene(models.Model): + class Meta: + db_table = "cognoma_genes" + + entrez_gene_id = models.IntegerField(primary_key=True) + symbol = models.CharField(max_length=32) + description = models.CharField(max_length=256) + chromosome = models.CharField(max_length=8, null=True) + gene_type = models.CharField(max_length=16) + 
synonyms = postgresfields.ArrayField(models.CharField(max_length=32), null=True) + aliases = postgresfields.ArrayField(models.CharField(max_length=256), null=True) + class Mutation(models.Model): class Meta: db_table = "mutations" @@ -44,7 +55,6 @@ class Meta: # id added by default gene = models.ForeignKey(Gene, related_name='mutations') sample = models.ForeignKey(Sample, related_name='mutations') - status = models.BooleanField() class Classifier(models.Model): class Meta: diff --git a/api/serializers.py b/api/serializers.py index 136fb08..a0a6cb3 100644 --- a/api/serializers.py +++ b/api/serializers.py @@ -5,8 +5,8 @@ from expander import ExpanderSerializerMixin from drf_dynamic_fields import DynamicFieldsMixin -from api.models import User, Classifier, Disease, Sample, Mutation -from genes.models import Gene, Organism +from api.models import User, Classifier, Disease, Sample, Mutation, Gene + class UserSerializer(DynamicFieldsMixin, serializers.Serializer): id = serializers.IntegerField(read_only=True) @@ -50,34 +50,24 @@ def to_representation(self, obj): return output -class OrganismSerializer(DynamicFieldsMixin, serializers.Serializer): - id = serializers.IntegerField() - taxonomy_id = serializers.IntegerField() - common_name = serializers.CharField() - scientific_name = serializers.CharField() - slug = serializers.CharField() - class MutationSerializer(DynamicFieldsMixin, ExpanderSerializerMixin, serializers.Serializer): id = serializers.IntegerField() gene = serializers.PrimaryKeyRelatedField(queryset=Gene.objects.all()) sample = serializers.PrimaryKeyRelatedField(queryset=Sample.objects.all()) - status = serializers.BooleanField() class GeneSerializer(DynamicFieldsMixin, ExpanderSerializerMixin, serializers.Serializer): class Meta: expandable_fields = { - 'organism': OrganismSerializer, 'mutations': (MutationSerializer, (), {'many': True}) } - id = serializers.IntegerField() - entrezid = serializers.IntegerField() - systematic_name = serializers.CharField() - 
standard_name = serializers.CharField() + entrez_gene_id = serializers.IntegerField() + symbol = serializers.CharField() description = serializers.CharField() - organism = OrganismSerializer() - aliases = serializers.CharField() - obsolete = serializers.BooleanField() + chromosome = serializers.CharField() + gene_type = serializers.CharField() + synonyms = serializers.ListField(child=serializers.CharField(allow_blank=True)) + aliases = serializers.ListField(child=serializers.CharField(allow_blank=True)) mutations = serializers.PrimaryKeyRelatedField(many=True, queryset=Mutation.objects.all()) class MutationSerializerMeta: @@ -156,4 +146,3 @@ class Meta: mutations = serializers.PrimaryKeyRelatedField(many=True, queryset=Mutation.objects.all()) gender = serializers.CharField() age_diagnosed = serializers.IntegerField() - diff --git a/api/test/test_classifiers.py b/api/test/test_classifiers.py index 3e9f7ca..b54c73f 100644 --- a/api/test/test_classifiers.py +++ b/api/test/test_classifiers.py @@ -1,7 +1,7 @@ from rest_framework.test import APITestCase, APIClient -from api.models import Disease -from genes.models import Gene, Organism +from api.models import Disease, Gene + class ClassifierTests(APITestCase): classifier_keys = ['id', @@ -22,30 +22,26 @@ def setUp(self): self.token = 'Bearer ' + self.user['random_slugs'][0] - self.human = Organism.objects.create(taxonomy_id=123, - common_name='human', - scientific_name='homo sapien', - slug='homo-sapien') - self.gene1 = Gene.objects.create(entrezid=123456, - systematic_name='foo', - description='bar', - aliases='foo, bar', - obsolete=False, - weight=1.0, - organism_id=self.human.id) - self.gene2 = Gene.objects.create(entrezid=234567, - systematic_name='foo', - description='bar', - aliases='foo, bar', - obsolete=False, - weight=1.0, - organism_id=self.human.id) + self.gene1 = Gene.objects.create(entrez_gene_id=123456, + symbol='GENE123', + description='foo', + chromosome='1', + gene_type='bar', + synonyms=['foo', 'bar'], 
+ aliases=['foo', 'bar']) + self.gene2 = Gene.objects.create(entrez_gene_id=234567, + symbol='GENE234', + description='foo', + chromosome='X', + gene_type='bar', + synonyms=['foo', 'bar'], + aliases=['foo', 'bar']) self.disease1 = Disease.objects.create(acronym='BLCA', name='bladder urothelial carcinoma') self.disease2 = Disease.objects.create(acronym='GBM', name='glioblastoma multiforme') self.classifier_post_data = { - 'genes': [self.gene1.id, self.gene2.id], + 'genes': [self.gene1.entrez_gene_id, self.gene2.entrez_gene_id], 'diseases': [self.disease1.acronym, self.disease2.acronym] } @@ -228,4 +224,3 @@ def test_expansion(self): self.assertTrue(isinstance(list_response.data['results'][1]['genes'][1], dict)) self.assertTrue(isinstance(list_response.data['results'][1]['diseases'][0], dict)) self.assertTrue(isinstance(list_response.data['results'][1]['diseases'][1], dict)) - diff --git a/api/test/test_genes.py b/api/test/test_genes.py index da227b3..2c92a29 100644 --- a/api/test/test_genes.py +++ b/api/test/test_genes.py @@ -1,37 +1,32 @@ from rest_framework.test import APITestCase, APIClient -from genes.models import Gene, Organism +from api.models import Gene class GeneTests(APITestCase): - gene_keys = ['id', - 'entrezid', - 'systematic_name', - 'standard_name', + gene_keys = ['entrez_gene_id', + 'symbol', 'description', - 'organism', + 'chromosome', + 'gene_type', + 'synonyms', 'aliases', - 'obsolete', 'mutations'] def setUp(self): - self.human = Organism.objects.create(taxonomy_id=123, - common_name='human', - scientific_name='homo sapien', - slug='homo-sapien') - self.gene1 = Gene.objects.create(entrezid=123456, - systematic_name='foo', - description='bar', - aliases='foo, bar', - obsolete=False, - weight=1.0, - organism_id=self.human.id) - self.gene2 = Gene.objects.create(entrezid=234567, - systematic_name='foo', - description='bar', - aliases='foo, bar', - obsolete=False, - weight=1.0, - organism_id=self.human.id) + self.gene1 = 
Gene.objects.create(entrez_gene_id=123456, + symbol='GENE123', + description='foo', + chromosome='1', + gene_type='bar', + synonyms=['foo', 'bar'], + aliases=['foo', 'bar']) + self.gene2 = Gene.objects.create(entrez_gene_id=234567, + symbol='GENE234', + description='foo', + chromosome='X', + gene_type='bar', + synonyms=['foo', 'bar'], + aliases=['foo', 'bar']) def test_list_genes(self): client = APIClient() @@ -50,7 +45,7 @@ def test_list_genes(self): def test_get_gene(self): client = APIClient() - get_response = client.get('/genes/' + str(self.gene1.id)) + get_response = client.get('/genes/' + str(self.gene1.entrez_gene_id)) self.assertEqual(get_response.status_code, 200) self.assertEqual(list(get_response.data.keys()), self.gene_keys) @@ -58,7 +53,7 @@ def test_get_gene(self): def test_entrezid_filter(self): client = APIClient() - list_response = client.get('/genes?entrezid=123456') + list_response = client.get('/genes?entrez_gene_id=123456') self.assertEqual(list_response.status_code, 200) self.assertEqual(list(list_response.data.keys()), ['count', @@ -67,5 +62,4 @@ def test_entrezid_filter(self): 'results']) self.assertEqual(len(list_response.data['results']), 1) self.assertEqual(list(list_response.data['results'][0].keys()), self.gene_keys) - self.assertEqual(list_response.data['results'][0]['entrezid'], 123456) - + self.assertEqual(list_response.data['results'][0]['entrez_gene_id'], 123456) diff --git a/api/test/test_mutations.py b/api/test/test_mutations.py index eb9bf51..1c5dfd9 100644 --- a/api/test/test_mutations.py +++ b/api/test/test_mutations.py @@ -1,26 +1,20 @@ from rest_framework.test import APITestCase, APIClient -from api.models import Sample, Disease, Mutation -from genes.models import Gene, Organism +from api.models import Sample, Disease, Mutation, Gene class MutationTests(APITestCase): mutation_keys = ['id', 'gene', - 'sample', - 'status'] + 'sample'] def setUp(self): - self.human = Organism.objects.create(taxonomy_id=123, - 
common_name='human', - scientific_name='homo sapien', - slug='homo-sapien') - self.gene1 = Gene.objects.create(entrezid=123456, - systematic_name='foo', - description='bar', - aliases='foo, bar', - obsolete=False, - weight=1.0, - organism_id=self.human.id) + self.gene1 = Gene.objects.create(entrez_gene_id=123456, + symbol='GENE123', + description='foo', + chromosome='1', + gene_type='bar', + synonyms=['foo', 'bar'], + aliases=['foo', 'bar']) self.disease1 = Disease.objects.create(acronym='BLCA', name='bladder urothelial carcinoma') self.sample1 = Sample.objects.create(sample_id='TCGA-22-4593-01', @@ -33,11 +27,9 @@ def setUp(self): age_diagnosed=43) self.mutation1 = Mutation.objects.create(gene=self.gene1, - sample=self.sample1, - status=True) + sample=self.sample1) self.mutation2 = Mutation.objects.create(gene=self.gene1, - sample=self.sample2, - status=True) + sample=self.sample2) def test_list_mutations(self): client = APIClient() @@ -60,4 +52,3 @@ def test_get_mutations(self): self.assertEqual(get_response.status_code, 200) self.assertEqual(list(get_response.data.keys()), self.mutation_keys) - diff --git a/api/test/test_organisms.py b/api/test/test_organisms.py deleted file mode 100644 index 3528828..0000000 --- a/api/test/test_organisms.py +++ /dev/null @@ -1,57 +0,0 @@ -from rest_framework.test import APITestCase, APIClient - -from genes.models import Gene, Organism - -class OrganismTests(APITestCase): - organism_keys = ['id', - 'taxonomy_id', - 'common_name', - 'scientific_name', - 'slug'] - - def setUp(self): - self.human = Organism.objects.create(taxonomy_id=123, - common_name='human', - scientific_name='homo sapien', - slug='homo-sapien') - self.awm33 = Organism.objects.create(taxonomy_id=234, - common_name='awm33', - scientific_name='homo githubien', - slug='homo-githubien') - - def test_list_organisms(self): - client = APIClient() - - list_response = client.get('/organisms') - - self.assertEqual(list_response.status_code, 200) - 
self.assertEqual(list(list_response.data.keys()), ['count', - 'next', - 'previous', - 'results']) - self.assertEqual(len(list_response.data['results']), 2) - self.assertEqual(list(list_response.data['results'][0].keys()), self.organism_keys) - self.assertEqual(list(list_response.data['results'][1].keys()), self.organism_keys) - - def test_get_organism(self): - client = APIClient() - - get_response = client.get('/organisms/' + str(self.human.id)) - - self.assertEqual(get_response.status_code, 200) - self.assertEqual(list(get_response.data.keys()), self.organism_keys) - - def test_taxonomy_id_filter(self): - client = APIClient() - - list_response = client.get('/organisms?taxonomy_id=234') - - self.assertEqual(list_response.status_code, 200) - self.assertEqual(list(list_response.data.keys()), ['count', - 'next', - 'previous', - 'results']) - self.assertEqual(len(list_response.data['results']), 1) - self.assertEqual(list(list_response.data['results'][0].keys()), self.organism_keys) - self.assertEqual(list_response.data['results'][0]['taxonomy_id'], 234) - diff --git a/api/test/test_sample.py b/api/test/test_sample.py index 463d085..6604855 100644 --- a/api/test/test_sample.py +++ b/api/test/test_sample.py @@ -1,7 +1,6 @@ from rest_framework.test import APITestCase, APIClient -from api.models import Sample, Disease, Mutation -from genes.models import Gene, Organism +from api.models import Sample, Disease, Mutation, Gene class SampleTests(APITestCase): sample_keys = ['sample_id', @@ -11,17 +10,13 @@ class SampleTests(APITestCase): 'age_diagnosed'] def setUp(self): - self.human = Organism.objects.create(taxonomy_id=123, - common_name='human', - scientific_name='homo sapien', - slug='homo-sapien') - self.gene1 = Gene.objects.create(entrezid=123456, - systematic_name='foo', - description='bar', - aliases='foo, bar', - obsolete=False, - weight=1.0, - organism_id=self.human.id) + self.gene1 = Gene.objects.create(entrez_gene_id=123456, + symbol='GENE123', + description='foo', 
+ chromosome='1', + gene_type='bar', + synonyms=['foo', 'bar'], + aliases=['foo', 'bar']) self.disease1 = Disease.objects.create(acronym='BLCA', name='bladder urothelial carcinoma') self.sample1 = Sample.objects.create(sample_id='TCGA-22-4593-01', @@ -34,11 +29,9 @@ def setUp(self): age_diagnosed=43) self.mutation1 = Mutation.objects.create(gene=self.gene1, - sample=self.sample1, - status=True) + sample=self.sample1) self.mutation2 = Mutation.objects.create(gene=self.gene1, - sample=self.sample2, - status=True) + sample=self.sample2) def test_list_samples(self): client = APIClient() @@ -61,4 +54,3 @@ def test_get_sample(self): self.assertEqual(get_response.status_code, 200) self.assertEqual(list(get_response.data.keys()), self.sample_keys) - diff --git a/api/views.py b/api/views.py index 53e4da8..0d7c34e 100644 --- a/api/views.py +++ b/api/views.py @@ -2,8 +2,7 @@ from rest_framework import filters from rest_framework import generics -from api.models import User, Classifier, Disease, Sample, Mutation -from genes.models import Gene, Organism +from api.models import User, Classifier, Disease, Sample, Mutation, Gene from api import serializers from api.auth import UserUpdateSelfOnly, ClassifierPermission @@ -67,40 +66,20 @@ class UserRetrieveUpdate(generics.RetrieveUpdateAPIView): class GeneFilter(filters.FilterSet): class Meta: model = Gene - fields = ['entrezid', 'systematic_name', 'standard_name', 'aliases', 'obsolete'] + fields = ['entrez_gene_id', 'symbol', 'chromosome', 'gene_type'] class GeneList(generics.ListAPIView): queryset = Gene.objects.all() serializer_class = serializers.GeneSerializer filter_backends = (filters.DjangoFilterBackend,) filter_class = GeneFilter - ordering_fields = ('entrezid', 'systematic_name', 'standard_name') - ordering = ('id',) + ordering_fields = ('entrez_gene_id', 'symbol', 'chromosome') + ordering = ('entrez_gene_id',) class GeneRetrieve(generics.RetrieveAPIView): queryset = Gene.objects.all() serializer_class = 
serializers.GeneSerializer - lookup_field = 'id' - -# Organisms - -class OrganismFilter(filters.FilterSet): - class Meta: - model = Organism - fields = ['taxonomy_id', 'common_name', 'scientific_name', 'slug'] - -class OrganismList(generics.ListAPIView): - queryset = Organism.objects.all() - serializer_class = serializers.OrganismSerializer - filter_backends = (filters.DjangoFilterBackend,) - filter_class = OrganismFilter - ordering_fields = ('taxonomy_id', 'common_name', 'scientific_name') - ordering = ('id',) - -class OrganismRetrieve(generics.RetrieveAPIView): - queryset = Organism.objects.all() - serializer_class = serializers.OrganismSerializer - lookup_field = 'id' + lookup_field = 'entrez_gene_id' # Diseases @@ -127,14 +106,14 @@ class DiseaseRetrieve(generics.RetrieveAPIView): class MutationFilter(filters.FilterSet): class Meta: model = Mutation - fields = ['gene', 'sample', 'status'] + fields = ['gene', 'sample'] class MutationList(generics.ListAPIView): queryset = Mutation.objects.all() serializer_class = serializers.MutationSerializer filter_backends = (filters.DjangoFilterBackend,) filter_class = MutationFilter - ordering_fields = ('id', 'status',) + ordering_fields = ('id',) ordering = ('id',) class MutationRetrieve(generics.RetrieveAPIView): @@ -150,7 +129,7 @@ class SampleFilter(filters.FilterSet): class Meta: model = Sample - fields = ['sample_id', 'disease', 'gender', 'age_diagnosed', 'mutations__gene', 'mutations__gene__entrezid'] + fields = ['sample_id', 'disease', 'gender', 'age_diagnosed', 'mutations__gene', 'mutations__gene__entrez_gene_id'] class SampleList(generics.ListAPIView): queryset = Sample.objects.all() diff --git a/cognoma_site/urls.py b/cognoma_site/urls.py index 5c12e50..64b5eee 100644 --- a/cognoma_site/urls.py +++ b/cognoma_site/urls.py @@ -8,9 +8,7 @@ url(r'^users/?$', views.UserListCreate.as_view()), url(r'^users/(?P[0-9]+)$', views.UserRetrieveUpdate.as_view()), url(r'^genes/?$', views.GeneList.as_view()), - 
url(r'^genes/(?P<id>[0-9]+)$', views.GeneRetrieve.as_view()), - url(r'^organisms/?$', views.OrganismList.as_view()), - url(r'^organisms/(?P<id>[0-9]+)$', views.OrganismRetrieve.as_view()), + url(r'^genes/(?P<entrez_gene_id>[0-9]+)$', views.GeneRetrieve.as_view()), url(r'^diseases/?$', views.DiseaseList.as_view()), url(r'^diseases/(?P<acronym>[a-zA-Z]+)$', views.DiseaseRetrieve.as_view()), url(r'^mutations/?$', views.MutationList.as_view()), diff --git a/requirements.txt b/requirements.txt index c070b1f..4df7c7a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,4 +15,4 @@ pyasn1==0.1.9 pycparser==2.16 pycrypto==2.6.1 PyJWT==1.4.2 -six==1.10.0 +six==1.10.0 \ No newline at end of file