root/trunk/ProjectFortress/demos/BirdCount0.fss

Revision 4346, 14.2 KB (checked in by chf, 13 hours ago)

one step closer

Line 
1(*******************************************************************************
2Copyright 2009 Michael Zody and Sun Microsystems, Inc.
3All rights reserved.
4
5Sun Microsystems, Inc. is the Copyright owner of the Fortress programming language software, and Michael Zody is the developer of the algorithm which this software implements and the Copyright owner of the software implementation of the algorithm, to which Sun Microsystems, Inc. has a perpetual, irrevocable, royalty free right and license to use and distribute.
6
7Use is subject to license terms accompanying the software.
8
9 ******************************************************************************)
10
11component BirdCount0
12import File.{...}
13import FileSupport.{...}
14import FlatString.{...}
15import List.{...}
16import Map.{...}
17import Set.{...}
18import System.{getProperty}
19
20export Executable
21
22N:ZZ32 = 4
23
24conversion:Map[\String,ZZ32\] =
25    {[\String,ZZ32\]
26         "AA"   |->0  ,
27         "CA"   |->1  ,
28         "GA"   |->2  ,
29         "TA"   |->3  ,
30         "AC"   |->1  ,
31         "CC"   |->0  ,
32         "GC"   |->3  ,
33         "TC"   |->2  ,
34         "AG"   |->2  ,
35         "CG"   |->3  ,
36         "GG"   |->0  ,
37         "TG"   |->1  ,
38         "AT"   |->3  ,
39         "CT"   |->2  ,
40         "GT"   |->1  ,
41         "TT"   |->0  ,
42         "AN"  |->N  ,
43         "CN"  |->N  ,
44         "GN"  |->N  ,
45         "TN"  |->N  ,
46         "NA"  |->N  ,
47         "NC"  |->N  ,
48         "NG"  |->N  ,
49         "NT"  |->N  ,
50         "NN"  |->N 
51      }
52
53inverseConversion:Map[\String,Char\] =
54    {[\String, Char\]
55        "A0" |-> 'A',
56        "A1" |-> 'C',
57        "A2" |-> 'G',
58        "A3" |-> 'T',
59        "C0" |-> 'C',
60        "C1" |-> 'A',
61        "C2" |-> 'T',
62        "C3" |-> 'G',
63        "G0" |-> 'G',
64        "G1" |-> 'T',
65        "G2" |-> 'A',
66        "G3" |-> 'C',
67        "T0" |-> 'T',
68        "T1" |-> 'G',
69        "T2" |-> 'C',
70        "T3" |-> 'A'
71    }
72       
73
74validTransitions:Set[\String\] = {[\String\]
75    "0011","0022","0033","0110","0123","0132","0213","0220","0231","0312","0321","0330",
76    "1001","1023","1032","1100","1122","1133","1203","1221","1230","1302","1320","1331",
77    "2002","2013","2031","2103","2112","2130","2200","2211","2233","2301","2310","2332",
78    "3003","3012","3021","3102","3113","3120","3201","3210","3223","3300","3311","3322"
79}
80
81referenceChickenFile:String = getProperty("fortress.automhome",".") || "/ProjectFortress/demos/chickenData/chr10.fa.head"
82sampleFilesDir:String = getProperty("fortress.automhome",".") || "/ProjectFortress/demos/chickenData/test_data"
83sampleFilesFileName:String = "chr10.csfasta.ma.sorted.head"
84
85sampleChickens:List[\String\] = <|[\String\]
86    "10a_LA_Frag35_20080704_white_leghorn_A",
87    "10b_LA_Frag35_20080704_white_leghorn_B"
88(*    "17_LA_Frag35_20080909_high_grow_line",
89    "18_LA_Frag35_20080909_low_grow_line",
90    "17.1_LA_Frag35_20080925_high_grow_line",
91    "18.1_LA_Frag35_20080925_low_grow_line",
92    "16_LA_Frag35_20080829_broiler",
93    "22_LA_Frag35_20081016_RJF",
94    "16.1_LA_Frag35_20080925_broiler",
95    "22.1_LA_Frag35_20081114_RJF" *)
96|>
97
98convert(c:(Char, Char)):String = do
99    (c1:Char,c2:Char) = c
100   conversion.member("" || c1 || c2).get()
101end
102
103object Pairs[\T\](g: Generator[\T\]) extends Generator[\(T,T)\]
104    generate[\R\](red: Reduction[\R\], m:(T,T)->R): R =
105    if (l,v,r) <- g.generate[\Maybe[\(T,R,T)\]\](PairReduction[\T,R\](red,m),
106                             fn (t:T) => Just[\(T,R,T)\](t, red.empty(), t))
107        then v else red.empty() end
108end
109
110object PairReduction[\T,R\](red: Reduction[\R\], m:(T,T)->R)
111        extends Reduction[\Maybe[\(T,R,T)\]\]
112    empty():Maybe[\(T,R,T)\] = Nothing[\(T,R,T)\]
113    join(left:Maybe[\(T,R,T)\], right:Maybe[\(T,R,T)\]) : Maybe[\(T,R,T)\]=
114    if (l, v_l, m_l) <- left then
115        if (m_r, v_r, r) <- right then
116            Just[\(T,R,T)\](l, red.join(red.join(v_l, m(m_l, m_r)), v_r), r)
117        else left end
118    else right end
119end
120
121
122trait dna comprises {dnaSequence, EmptySequence}
123       getter colorSequence(): String
124       opr OPLUS(self, other: dna) : dna
125end
126
127
128(* left and right are ACGT encoded, middle is color encoded *)
129
130object dnaSequence(left:Char, middle:String, right:Char) extends dna
131   getter colorSequence():String = middle
132   opr OPLUS(self, other:dnaSequence):dnaSequence =
133       dnaSequence(left, middle || convert(right, other.left) || other.middle(), other.right)
134   opr OPLUS(self, other:EmptySequence) = self
135end
136
137object EmptySequence() extends dna
138    getter colorSequence():String = ""
139    opr OPLUS(self, other:dna) = other
140end
141
142object dnaSequenceReduction extends { MonoidReduction[\dna\],
143                               ReductionWithZeroes[\dna,dna\] }
144    getter asString() = "dnaSequenceReduction"
145    empty(): dna = EmptySequence()
146    join(a: dna, b: dna): dna = a OPLUS b
147end
148
149opr BIG OPLUS[\T\](): BigReduction[\dna, dna\] =
150   BigReduction[\dna,dna\](dnaSequenceReduction)
151
152opr BIG OPLUS[\T\](g:Generator[\dna\]) =
153   __bigOperatorSugar[\dna,dna,dna,dna\](BIG OPLUS[\T\](), g)
154
155
156processReferenceFileLine(input:String):dna = do
157    if input.get(0) = '>'
158        then EmptySequence()
159    else
160        size = |input|
161        println input " has " size " characters " 
162        dnaSequence(input.get(0), BIG || [p<- Pairs(input)](convert(p)), input.get(size-1))
163    end
164end
165
166processReferenceFile(name:String):String=do
167   var rs:FileReadStream = FileReadStream(name)
168   var res:dna  = BIG OPLUS [l<-rs.lines()] processReferenceFileLine(l)
169   colors = res.colorSequence()
170   println "result has " |colors| " characters which should be 1 less than input ACGT colors."
171   colors
172end
173
174readReferenceFile(name:String):String=do
175   var rs:FileReadStream = FileReadStream(name)
176   var res:String = BIG || [l<-rs.lines()] l
177   res
178end
179
180object Snip(header:String, sequence:String, name:String, pos:ZZ32, length:ZZ32, seqend:ZZ32,
181            referenceChickenColorsSnip:String, sampleChickenColorsSnip:String,
182            referenceChickenAGCTSnip:String)
183    getter header():String = header
184    getter sequence():String = sequence
185    getter name():String = name
186    getter pos():ZZ32 = pos
187    getter length():ZZ32 = length
188    getter seqend():ZZ32 = seqend
189    getter referenceChickenColorsSnip():String = referenceChickenColorsSnip
190    getter sampleChickenColorsSnip():String = sampleChickenColorsSnip
191    getter referenceChickenAGCTSnip():String = referenceChickenAGCTSnip   
192    getter asString():String = "\\n{Snip:" || name || "\\n" || referenceChickenColorsSnip || "\\n" sampleChickenColorsSnip || "\\n" || referenceChickenAGCTSnip
193
194    sampleChickenAGCTSnip():String = do
195
196        l:ZZ32 = |sampleChickenColorsSnip|
197        var i:ZZ32 := 0
198        var result:String := ""
199        var current:Char := sampleChickenColorsSnip.get(0)
200        var c:Char = referenceChickenAGCTSnip.get(0)
201
202        while (i < l) do
203          result := result || c
204          current := sampleChickenColorsSnip.get(i)
205          temp:String = c.asString() || current.asString()
206          c:= inverseConversion.member(temp).get
207          i:= i+1
208        end
209        result := result || c
210        result
211    end
212
213(*
214       BIG || [(x,y) <- referenceChickenAGCTSnip.zip[\Char\](referenceChickenColorsSnip.javaSubstr(0,1) || sampleChickenColorsSnip)] inverseConversion.member(x.asString() || y.asString()).get()
215
216*)
217
218    printSnip():() = do
219        println("Snip:" name)
220        println(" " referenceChickenColorsSnip)
221        println(" " sampleChickenColorsSnip)
222        println(referenceChickenAGCTSnip)
223(*        println(sampleChickenAGCTSnip())  *)
224    end
225
226
227end
228
229reverse(sequence:String):String = BIG || [c<-sequence.reverse()] c
230
231negativeOrientation(header:String):Boolean =  do
232   pattern:String = ">\\w*,\\d*_-.*"
233   header.javaRegExpMatches(pattern)
234end
235
236getAdjustedLocation(negativeOri:Boolean, location:String, length:ZZ32):ZZ32 = do
237   if (negativeOri) then
238      strToInt(location.asFlatString().javaSubstr(1), 10) - length + 1       
239   else
240      strToInt(location) - 1
241   end
242end
243
244getReferenceChickenColorsSnip(adjustedLoc:ZZ32, seqEnd:ZZ32):FlatString = do
245   referenceChickenColors[adjustedLoc:seqEnd+1].asFlatString()
246end
247
248getAdjustedSequence(negativeOri:Boolean, sequence:String, refChickenColorsSnip:FlatString):FlatString = do
249    temp:FlatString = sequence.asFlatString().javaSubstr(1)
250    tempLength:ZZ32 = |temp|
251    if (negativeOri) then
252        ((reverse(temp).asFlatString().javaSubstr(0, tempLength - 1 )).asFlatString() || refChickenColorsSnip.asFlatString().javaSubstr(tempLength - 1, tempLength).asFlatString()).asFlatString()
253    else
254        (refChickenColorsSnip.asFlatString().javaSubstr(0,1) || temp.asFlatString().javaSubstr(1).asFlatString()).asFlatString()
255    end
256end
257
258
259readASnip(r:ReadStream):Snip = do
260   header:String = r.uncheckedReadLine()
261   var sequence:String = r.uncheckedReadLine()
262
263   if sequence = "" then Snip(header, sequence,"", 0, 0, 0, "","","") else
264      name:String = header.asFlatString().javaRegExpSplit(",",0)
265      loc:String = header.asFlatString().javaRegExpSplit(",",1).asFlatString().javaRegExpSplit("_",1).asFlatString().javaRegExpSplit("\\.",0)
266      length:ZZ32 = sequence.asFlatString().size() - 1
267      negativeOri:Boolean = negativeOrientation(header)
268      adjustedLocation:ZZ32 = getAdjustedLocation(negativeOri, loc, length)
269      sequenceEnd:ZZ32 = adjustedLocation + length - 1
270      referenceChickenColorsSnip:FlatString = getReferenceChickenColorsSnip(adjustedLocation, sequenceEnd)
271      adjustedSequence:FlatString := getAdjustedSequence(negativeOri, sequence, referenceChickenColorsSnip)
272      referenceChickenAGCTSnip:String = referenceChickenAGCT.asFlatString().javaSubstr(adjustedLocation+6, sequenceEnd + 6 + 2)
273      Snip(header, adjustedSequence, name, adjustedLocation, length, sequenceEnd, referenceChickenColorsSnip, adjustedSequence, referenceChickenAGCTSnip)
274   end
275end
276
277
278ChunkSize:ZZ32 = 10000
279
280
281(* Given two samples, this creates a list of the positions where they differ *)
282
283object Event(s:Snip, diffs:String, startPos:ZZ32, endPos:ZZ32)
284    getter snip():Snip = s
285    getter startPos():ZZ32 = startPos
286    getter endPos():ZZ32 = endPos
287    getter diffs():String = diffs
288    getter asString():String = "{Event:" || s.header() || ":" startPos || "," || endPos || "}"
289    printEvent():() = do
290       println("Event:" )
291       s.printSnip()
292       println(" " diffs)
293       var i:ZZ32 := 0
294       var c:Char := s.referenceChickenAGCTSnip().get(startPos)
295       var result:String :=  c.asString()
296       while (i < endPos) do
297          next:Char = inverseConversion.member(c.asString() || s.sampleChickenColorsSnip.get(i+startPos).asString()).get()
298          result := result || next
299          c := next
300          i := i+1
301       end
302       println(s.referenceChickenAGCTSnip.javaSubstr(startPos, endPos+startPos+1))
303       println(result)
304    end
305end
306
307(* This code is a little obtuse.  The idea is that if the sample differs from the reference
308   chicken in more than one consecutive location it may be an interesting mutation.  We look
309   for these potential interesting mutations and return a list of them.
310 *)
311
312makeTuple(loc:ZZ32, diffs:String):(ZZ32,ZZ32) = do
313    temp:String = diffs.asFlatString().javaSubstr(loc).javaRegExpSplit("\\.",0)
314    (loc, |temp|)
315end
316
317makeEvent(s:Snip, diffs:String, loc:ZZ32):Event = do
318    temp:String = diffs.asFlatString().javaSubstr(loc).javaRegExpSplit("\\.",0)
319    Event(s, diffs, loc, |temp|)
320end   
321
322isInterestingPosition(loc:ZZ32, diffs:String):Boolean = do
323   if loc = 0 OR: diffs.get(loc) = '.'  OR: loc = (|diffs| - 1) OR: (diffs.get(loc - 1) = 'X') OR: (|diffs.asFlatString().javaSubstr(loc).javaRegExpSplit("\\.",0)| < 2) then
324      false
325   else true
326   end
327end
328
329isValidPairTransition(r:String, s:String) = do
330    (r || s) IN validTransitions
331end
332
333isValidTransition(reference:String, sample:String) = do
334    println("Reference = " reference)
335    println("sample = " sample)
336    var i:ZZ32 := 0
337    var result:Boolean := true
338    while (i < |reference| - 1) do
339        r = reference.javaSubstr(i, 2)
340        s = sample.javaSubstr(i, 2)
341        result := result AND isValidPairTransition(r,s)
342        i := i + 1
343    end
344    result
345end
346
347isValidTransition(s:Snip, loc:ZZ32, diffs:String):Boolean = do
348   len:ZZ32 = |diffs.asFlatString().javaSubstr(loc).javaRegExpSplit("\\.",0)|
349   sample:String = s.sampleChickenColorsSnip().javaSubstr(loc, loc+len)
350   reference:String = s.referenceChickenColorsSnip().javaSubstr(loc, loc+len)
351   isValidTransition(reference, sample)
352end
353
354isInteresting(s:Snip, loc:ZZ32, diffs:String):Boolean = do
355   isInterestingPosition(loc, diffs) AND: isValidTransition(s, loc,diffs)
356end
357
358
359
360
361sampleCompare(ref:String, sample:String):String = do
362   BIG || [(x,y) <- ref.zip[\Char\](sample)] (if (x =/= y) then "X" else "." end)
363end
364
365(* Given two samples this creates events *)
366
367eventGenerator(s:Snip):Map[\ZZ32, List[\(ZZ32,ZZ32)\]\] = do
368    diffs = sampleCompare(s.referenceChickenColorsSnip(),s.sampleChickenColorsSnip())
369    stop = |diffs| - 1
370    var database:Map[\ZZ32, List[\Event\]\] := {[\ZZ32, List[\Event\]\]}
371    for x <- 0:stop do
372        if isInteresting(s,x,diffs) then
373           database := database.add(x + s.pos(), <| makeEvent(s, diffs, x) |>)
374        end
375    end
376    database
377end
378
379combine(k:ZZ32, val1:List[\Event\], val2:List[\Event\]):List[\Event\] = val1 || val2
380
381processSampleChicken(name:String):Map[\ZZ32, List[\Event\]\] = do
382    max:ZZ32 = referenceChickenColors.size()
383    var rs:FileReadStream =
384        FileReadStream(sampleFilesDir || "/" || name || "/" || sampleFilesFileName)
385    fg:FileGenerator[\Snip\]  = FileGenerator[\Snip\](rs,60,readASnip)
386    var database:Map[\ZZ32, List[\Event\]\] := {[\ZZ32, List[\Event\]\]}
387
388    for s <- fg do
389       database := database.union(combine, eventGenerator(s))
390    end
391    database
392end
393
394var referenceChickenColors:String := ""
395var referenceChickenAGCT:String := ""
396
397run() = do
398    referenceChickenAGCT   := readReferenceFile(referenceChickenFile)
399    referenceChickenColors := processReferenceFile(referenceChickenFile)
400    var database:Map[\ZZ32, List[\Event\]\] := {[\ZZ32, List[\Event\]\]}
401   
402    for sampleChicken <- sampleChickens do
403        database := database.union(combine,processSampleChicken(sampleChicken))
404    end
405    println("Summary1")
406    for (key,entryList) <- seq(database) do
407       println(key)
408       for entry <- seq(entryList) do
409          entry.printEvent()
410       end
411    end
412
413end 
414end
415
416
Note: See TracBrowser for help on using the browser.