Skip to content

Commit

Permalink
codecreatede
Browse files Browse the repository at this point in the history
Finished the code and added a binary release. Reduced the code; it now also writes a SAM-compatible file. Next: writing a B-tree for the same output and then, based on that, using DFS to search for the linked k-mers.
  • Loading branch information
applicativesystem committed Oct 23, 2024
1 parent a73e40a commit f944d4b
Show file tree
Hide file tree
Showing 11 changed files with 998 additions and 73,532 deletions.
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,15 @@ exit status 1
```

- It produces a SAM-compatible format that gives the start, end, k-mer, origin of the k-mer, and all the other details.
- Currently writing an interconverter that will convert this output into a hash B-tree.

```
0 5 ATACT ATACTTTAAATTTTAGTTACTATTAT
1 6 TACTT ATACTTTAAATTTTAGTTACTATTAT
2 7 ACTTT ATACTTTAAATTTTAGTTACTATTAT
3 8 CTTTA ATACTTTAAATTTTAGTTACTATTAT
4 9 TTTAA ATACTTTAAATTTTAGTTACTATTAT
```

Gaurav Sablok
86 changes: 86 additions & 0 deletions genomehashes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
0 5 ATACT ATACTTTAAATTTTAGTTACTATTAT
1 6 TACTT ATACTTTAAATTTTAGTTACTATTAT
2 7 ACTTT ATACTTTAAATTTTAGTTACTATTAT
3 8 CTTTA ATACTTTAAATTTTAGTTACTATTAT
4 9 TTTAA ATACTTTAAATTTTAGTTACTATTAT
5 10 TTAAA ATACTTTAAATTTTAGTTACTATTAT
6 11 TAAAT ATACTTTAAATTTTAGTTACTATTAT
7 12 AAATT ATACTTTAAATTTTAGTTACTATTAT
8 13 AATTT ATACTTTAAATTTTAGTTACTATTAT
9 14 ATTTT ATACTTTAAATTTTAGTTACTATTAT
10 15 TTTTA ATACTTTAAATTTTAGTTACTATTAT
11 16 TTTAG ATACTTTAAATTTTAGTTACTATTAT
12 17 TTAGT ATACTTTAAATTTTAGTTACTATTAT
13 18 TAGTT ATACTTTAAATTTTAGTTACTATTAT
14 19 AGTTA ATACTTTAAATTTTAGTTACTATTAT
15 20 GTTAC ATACTTTAAATTTTAGTTACTATTAT
16 21 TTACT ATACTTTAAATTTTAGTTACTATTAT
17 22 TACTA ATACTTTAAATTTTAGTTACTATTAT
18 23 ACTAT ATACTTTAAATTTTAGTTACTATTAT
19 24 CTATT ATACTTTAAATTTTAGTTACTATTAT
20 25 TATTA ATACTTTAAATTTTAGTTACTATTAT
21 26 ATTAT ATACTTTAAATTTTAGTTACTATTAT
0 5 ATACT ATACTTTAAATTTTAGTTACTATTAT
1 6 TACTT ATACTTTAAATTTTAGTTACTATTAT
2 7 ACTTT ATACTTTAAATTTTAGTTACTATTAT
3 8 CTTTA ATACTTTAAATTTTAGTTACTATTAT
4 9 TTTAA ATACTTTAAATTTTAGTTACTATTAT
5 10 TTAAA ATACTTTAAATTTTAGTTACTATTAT
6 11 TAAAT ATACTTTAAATTTTAGTTACTATTAT
7 12 AAATT ATACTTTAAATTTTAGTTACTATTAT
8 13 AATTT ATACTTTAAATTTTAGTTACTATTAT
9 14 ATTTT ATACTTTAAATTTTAGTTACTATTAT
10 15 TTTTA ATACTTTAAATTTTAGTTACTATTAT
11 16 TTTAG ATACTTTAAATTTTAGTTACTATTAT
12 17 TTAGT ATACTTTAAATTTTAGTTACTATTAT
13 18 TAGTT ATACTTTAAATTTTAGTTACTATTAT
14 19 AGTTA ATACTTTAAATTTTAGTTACTATTAT
15 20 GTTAC ATACTTTAAATTTTAGTTACTATTAT
16 21 TTACT ATACTTTAAATTTTAGTTACTATTAT
17 22 TACTA ATACTTTAAATTTTAGTTACTATTAT
18 23 ACTAT ATACTTTAAATTTTAGTTACTATTAT
19 24 CTATT ATACTTTAAATTTTAGTTACTATTAT
20 25 TATTA ATACTTTAAATTTTAGTTACTATTAT
0 5 ATACT ATACTTTAAATTTTAGTTACTATTAT
1 6 TACTT ATACTTTAAATTTTAGTTACTATTAT
2 7 ACTTT ATACTTTAAATTTTAGTTACTATTAT
3 8 CTTTA ATACTTTAAATTTTAGTTACTATTAT
4 9 TTTAA ATACTTTAAATTTTAGTTACTATTAT
5 10 TTAAA ATACTTTAAATTTTAGTTACTATTAT
6 11 TAAAT ATACTTTAAATTTTAGTTACTATTAT
7 12 AAATT ATACTTTAAATTTTAGTTACTATTAT
8 13 AATTT ATACTTTAAATTTTAGTTACTATTAT
9 14 ATTTT ATACTTTAAATTTTAGTTACTATTAT
10 15 TTTTA ATACTTTAAATTTTAGTTACTATTAT
11 16 TTTAG ATACTTTAAATTTTAGTTACTATTAT
12 17 TTAGT ATACTTTAAATTTTAGTTACTATTAT
13 18 TAGTT ATACTTTAAATTTTAGTTACTATTAT
14 19 AGTTA ATACTTTAAATTTTAGTTACTATTAT
15 20 GTTAC ATACTTTAAATTTTAGTTACTATTAT
16 21 TTACT ATACTTTAAATTTTAGTTACTATTAT
17 22 TACTA ATACTTTAAATTTTAGTTACTATTAT
18 23 ACTAT ATACTTTAAATTTTAGTTACTATTAT
19 24 CTATT ATACTTTAAATTTTAGTTACTATTAT
20 25 TATTA ATACTTTAAATTTTAGTTACTATTAT
21 26 ATTAT ATACTTTAAATTTTAGTTACTATTAT
0 5 ATACT ATACTTTAAATTTTAGTTACTATTAT
1 6 TACTT ATACTTTAAATTTTAGTTACTATTAT
2 7 ACTTT ATACTTTAAATTTTAGTTACTATTAT
3 8 CTTTA ATACTTTAAATTTTAGTTACTATTAT
4 9 TTTAA ATACTTTAAATTTTAGTTACTATTAT
5 10 TTAAA ATACTTTAAATTTTAGTTACTATTAT
6 11 TAAAT ATACTTTAAATTTTAGTTACTATTAT
7 12 AAATT ATACTTTAAATTTTAGTTACTATTAT
8 13 AATTT ATACTTTAAATTTTAGTTACTATTAT
9 14 ATTTT ATACTTTAAATTTTAGTTACTATTAT
10 15 TTTTA ATACTTTAAATTTTAGTTACTATTAT
11 16 TTTAG ATACTTTAAATTTTAGTTACTATTAT
12 17 TTAGT ATACTTTAAATTTTAGTTACTATTAT
13 18 TAGTT ATACTTTAAATTTTAGTTACTATTAT
14 19 AGTTA ATACTTTAAATTTTAGTTACTATTAT
15 20 GTTAC ATACTTTAAATTTTAGTTACTATTAT
16 21 TTACT ATACTTTAAATTTTAGTTACTATTAT
17 22 TACTA ATACTTTAAATTTTAGTTACTATTAT
18 23 ACTAT ATACTTTAAATTTTAGTTACTATTAT
19 24 CTATT ATACTTTAAATTTTAGTTACTATTAT
20 25 TATTA ATACTTTAAATTTTAGTTACTATTAT
Binary file added gonuciter
Binary file not shown.
Binary file added gonuciter.tar
Binary file not shown.
740 changes: 740 additions & 0 deletions illuminahashes.txt

Large diffs are not rendered by default.

157 changes: 59 additions & 98 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,6 @@ func pacbioFunc(cmd *cobra.Command, args []string) {
pacbioIDConstruct := []pacbiofileID{}
pacbioSeqConstruct := []pacbiofileSeq{}

// reading and storing the pacbio file

fpacbio, err := os.Open(pacbiofile)
if err != nil {
log.Fatal(err)
Expand Down Expand Up @@ -133,44 +131,38 @@ func pacbioFunc(cmd *cobra.Command, args []string) {

pacbioHashes := []string{}

for i := 0; i <= len(pacbioIsolate); i++ {
for i := 0; i <= len(pacbioIsolate)-1; i++ {
for j := 0; j <= len(pacbioIsolate[i])-kmerArgs; j++ {
pacbioHashes = append(pacbioHashes, pacbioIsolate[i][j:j+kmerArgs])
}
}

uniquePacbio, _ := uniqueHash(pacbioIsolate)

pacbioFile, err := os.Create("pacbiohashes.txt")
if err != nil {
log.Fatal(err)
defer pacbioFile.Close()
}
for i := range uniquePacbio {
pacbioFile.WriteString(uniquePacbio[i] + "\n")
}

pacbioIndexStart := []int{}
pacbioIndexEnd := []int{}
captureUnique := []string{}

for i := range uniquePacbio {
for j := range pacbioIsolate {
start := strings.Index(pacbioIsolate[j], uniquePacbio[i])
end := start + len(uniquePacbio[i])
pacbioIndexStart = append(pacbioIndexStart, start)
pacbioIndexEnd = append(pacbioIndexEnd, end)
for i := 0; i <= len(pacbioHashes)-2; i++ {
if pacbioHashes[i] == pacbioHashes[i+1] {
continue
} else {
captureUnique = append(captureUnique, pacbioHashes[i])
}
}

pacbioWrite, err := os.Create("pacbioKmerOrigin.txt")
pacbioFile, err := os.Create("pacbiohashes.txt")
if err != nil {
log.Fatal(err)
defer pacbioFile.Close()
}

for i := range pacbioIndexStart {
start := strconv.Itoa(pacbioIndexStart[i])
end := strconv.Itoa(pacbioIndexEnd[i])
pacbioWrite.WriteString(start + "\t" + end + "\t" + uniquePacbio[i])
for i := 0; i <= len(pacbioIsolate)-1; i++ {
for j := 0; j <= len(captureUnique)-1; j++ {
pacbioFile.WriteString(
strconv.Itoa(
strings.Index(pacbioIsolate[i], captureUnique[j]),
) + "\t" + strconv.Itoa(
strings.Index(pacbioIsolate[i], captureUnique[j])+len(captureUnique[j]),
) +
"\t" + captureUnique[j] + "\t" + pacbioIsolate[i] + "\n",
)
}
}
}

Expand Down Expand Up @@ -214,44 +206,38 @@ func genomeFunc(cmd *cobra.Command, args []string) {
}

genomeHashes := []string{}
for i := 0; i <= len(genomeIsolate); i++ {
for i := 0; i <= len(genomeIsolate)-1; i++ {
for j := 0; j <= len(genomeIsolate[i])-kmerArgs; j++ {
genomeHashes = append(genomeHashes, genomeIsolate[i][j:j+kmerArgs])
}
}

uniqueGenome, _ := uniqueHash(genomeIsolate)

genomeFile, err := os.Create("pacbiohashes.txt")
if err != nil {
log.Fatal(err)
defer genomeFile.Close()
}
for i := range uniqueGenome {
genomeFile.WriteString(uniqueGenome[i] + "\n")
}

genomeIndexStart := []int{}
genomeIndexEnd := []int{}
captureUnique := []string{}

for i := range uniqueGenome {
for j := range genomeIsolate {
start := strings.Index(genomeIsolate[j], uniqueGenome[i])
end := start + len(uniqueGenome[i])
genomeIndexStart = append(genomeIndexStart, start)
genomeIndexEnd = append(genomeIndexEnd, end)
for i := 0; i <= len(genomeHashes)-2; i++ {
if genomeHashes[i] == genomeHashes[i+1] {
continue
} else {
captureUnique = append(captureUnique, genomeHashes[i])
}
}

genomeWrite, err := os.Create("genomeKmerOrigin.txt")
genomeFile, err := os.Create("genomehashes.txt")
if err != nil {
log.Fatal(err)
defer genomeFile.Close()
}

for i := range genomeIndexStart {
start := strconv.Itoa(genomeIndexStart[i])
end := strconv.Itoa(genomeIndexEnd[i])
genomeWrite.WriteString(start + "\t" + end + "\t" + uniqueGenome[i])
for i := 0; i <= len(genomeIsolate)-1; i++ {
for j := 0; j <= len(captureUnique)-1; j++ {
genomeFile.WriteString(
strconv.Itoa(
strings.Index(genomeIsolate[i], captureUnique[j]),
) + "\t" + strconv.Itoa(
strings.Index(genomeIsolate[i], captureUnique[j])+len(captureUnique[j]),
) +
"\t" + captureUnique[j] + "\t" + genomeIsolate[i] + "\n",
)
}
}
}

Expand Down Expand Up @@ -295,62 +281,37 @@ func illuminaFunc(cmd *cobra.Command, args []string) {
}

illuminaHashes := []string{}

for i := 0; i <= len(illuminaIsolate); i++ {
for i := 0; i <= len(illuminaIsolate)-1; i++ {
for j := 0; j <= len(illuminaIsolate[i])-kmerArgs; j++ {
illuminaHashes = append(illuminaHashes, illuminaIsolate[i][j:j+kmerArgs])
}
}

uniqueillumina, _ := uniqueHash(illuminaIsolate)

illuminaFile, err := os.Create("pacbiohashes.txt")
if err != nil {
log.Fatal(err)
defer illuminaFile.Close()
}
for i := range uniqueillumina {
illuminaFile.WriteString(uniqueillumina[i] + "\n")
}

illuminaIndexStart := []int{}
illuminaIndexEnd := []int{}
captureUnique := []string{}

for i := range uniqueillumina {
for j := range illuminaIsolate {
start := strings.Index(illuminaIsolate[j], uniqueillumina[i])
end := start + len(uniqueillumina[i])
illuminaIndexStart = append(illuminaIndexStart, start)
illuminaIndexEnd = append(illuminaIndexEnd, end)
for i := 0; i <= len(illuminaHashes)-2; i++ {
if illuminaHashes[i] == illuminaHashes[i+1] {
continue
} else {
captureUnique = append(captureUnique, illuminaHashes[i])
}
}

illuminaWrite, err := os.Create("illuminaKmerOrigin.txt")
illuminaFile, err := os.Create("illuminahashes.txt")
if err != nil {
log.Fatal(err)
defer illuminaFile.Close()
}

for i := range illuminaIndexStart {
start := strconv.Itoa(illuminaIndexStart[i])
end := strconv.Itoa(illuminaIndexEnd[i])
illuminaWrite.WriteString(start + "\t" + end + "\t" + uniqueillumina[i])
}
}

// Go has no built-in unique implementation, so an additional unique function is used to compare the hashes to each other.
func uniqueHash(inputvar []string) ([]string, int) {
captureHash := inputvar
captureUnique := []string{}
var captureLength int

for i := 0; i <= len(captureHash)-1; i++ {
if captureHash[i] == captureHash[i+1] {
continue
} else {
captureUnique = append(captureUnique, captureHash[i])
for i := 0; i <= len(illuminaIsolate)-1; i++ {
for j := 0; j <= len(captureUnique)-1; j++ {
illuminaFile.WriteString(
strconv.Itoa(
strings.Index(illuminaIsolate[i], captureUnique[j]),
) + "\t" + strconv.Itoa(
strings.Index(illuminaIsolate[i], captureUnique[j])+len(captureUnique[j]),
) +
"\t" + captureUnique[j] + "\t" + illuminaIsolate[i] + "\n",
)
}
}
captureLength += len(captureUnique)

return captureUnique, captureLength
}
Loading

0 comments on commit f944d4b

Please sign in to comment.