Skip to content

Commit 7120503

Browse files
author
Hernan Morales
committed
Use SmaCC for PHYLIP parsing
1 parent b004ea6 commit 7120503

7 files changed

Lines changed: 281 additions & 13 deletions

repository/BioParsers-Tests/BioPhylipParserTest.class.st

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,7 @@ Class {
77

88
{ #category : 'testing' }
99
BioPhylipParserTest >> firstLineTokenizer [
10-
11-
^ BioPhylipParser new firstLineTokenizer
10+
^ BioPhylipSmaCCParser new firstLineTokenizer
1211
]
1312

1413
{ #category : 'testing' }
@@ -71,13 +70,13 @@ TSIDVIHSFT ISTLGIKIDC IPGRCN
7170
{ #category : 'testing' }
7271
BioPhylipParserTest >> speciesDNALineTokenizer [
7372

74-
^ BioPhylipParser new speciesDNALineTokenizer
73+
^ BioPhylipSmaCCParser new speciesDNALineTokenizer
7574
]
7675

7776
{ #category : 'testing' }
7877
BioPhylipParserTest >> speciesDNANamedBlockTokenizer [
7978

80-
^ BioPhylipParser new speciesDNANamedBlockTokenizer
79+
^ BioPhylipSmaCCParser new speciesDNANamedBlockTokenizer
8180
]
8281

8382
{ #category : 'testing' }

repository/BioParsers-Tests/BioProteinParserTest.class.st

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ Class {
55
#package : 'BioParsers-Tests'
66
}
77

8-
{ #category : 'accessing' }
8+
{ #category : 'running' }
99
BioProteinParserTest >> setUp [
1010

1111
super setUp.

repository/BioParsers/BioParser.class.st

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -370,16 +370,16 @@ BioParser class >> tokenizeNcbiXmlEntrezSearchResult: aString nodes: aCollection
370370

371371
{ #category : 'parse-phylip' }
372372
BioParser class >> tokenizePhylipInterleavedDNA: aPhylipString [
373-
" Parse aPhylipString and answer a BioPhylip object with contents filled "
374-
375-
^ ( BioPhylipParser on: aPhylipString ) tokenizeInterleavedDNA
373+
| parser |
374+
parser := BioPhylipSmaCCParser on: aPhylipString readStream.
375+
^ parser parseInterleaved
376376
]
377377

378378
{ #category : 'parse-phylip' }
379379
BioParser class >> tokenizePhylipInterleavedProtein: aPhylipString [
380-
" Parse aPhylipString and answer a BioPhylip object with contents filled "
381-
382-
^ ( BioPhylipParser on: aPhylipString ) tokenizeInterleavedProtein
380+
| parser |
381+
parser := BioPhylipSmaCCParser on: aPhylipString readStream.
382+
^ parser parseInterleaved
383383
]
384384

385385
{ #category : 'parse-ncbi-identifiers' }

repository/BioParsers/BioPhylipParser.class.st

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,12 @@ Class {
1717
#tag : 'PHYLIP'
1818
}
1919

20-
{ #category : 'as yet unclassified' }
20+
{ #category : 'parsing' }
2121
BioPhylipParser class >> parseFile: aFileReference [
2222
^ self new parseFile: aFileReference
2323
]
2424

25-
{ #category : 'as yet unclassified' }
25+
{ #category : 'parsing' }
2626
BioPhylipParser class >> parseString: aString [
2727
^ self new parseString: aString
2828
]
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
"
I am a hand-written scanner over a character stream holding PHYLIP text.

I provide scanNumber (a run of digits), scanNameOrSequence (a run of letters, digits and the characters . - ? _) and scanToken, which dispatches on the next non-separator character of the stream.
"
Class {
2+
#name : 'BioPhylipScanner',
3+
#superclass : 'SmaCCScanner',
4+
#category : 'BioParsers-PHYLIP',
5+
#package : 'BioParsers',
6+
#tag : 'PHYLIP'
7+
}
8+
9+
{ #category : 'instance creation' }
BioPhylipScanner class >> on: aStream [
	"Answer a new scanner that reads its characters from aStream."

	| scanner |
	scanner := self new.
	scanner stream: aStream.
	^ scanner
]
13+
14+
{ #category : 'private' }
BioPhylipScanner >> scanNameOrSequence [
	"Consume the longest run of name/sequence characters at the current stream
	 position and answer it wrapped in a SmaCCToken. Accepted characters are
	 letters, digits and . - ? _
	 The run is accumulated in a write stream rather than sliced out of
	 'stream contents', so the whole input is not copied once per token and the
	 result does not depend on how stream positions map to collection indices."

	| content |
	content := String streamContents: [ :out |
		[ stream atEnd not and: [
			| c |
			c := stream peek.
			c isLetter or: [ c isDigit or: [ '.-?_' includes: c ] ] ] ]
			whileTrue: [ out nextPut: stream next ] ].
	^ SmaCCToken value: content
]
26+
27+
{ #category : 'private - scanning' }
BioPhylipScanner >> scanNumber [
	"Consume a run of decimal digits and answer it (as a String) wrapped in a
	 SmaCCToken, then skip any separators that follow the number.
	 The digits are captured BEFORE the trailing separators are skipped, so the
	 token value never contains whitespace."

	| digits |
	digits := String streamContents: [ :out |
		[ stream atEnd not and: [ stream peek isDigit ] ]
			whileTrue: [ out nextPut: stream next ] ].
	stream skipSeparators.
	^ SmaCCToken value: digits
]
36+
37+
{ #category : 'private - scanning' }
BioPhylipScanner >> scanToken [
	"Answer the next token of the stream, or nil when nothing but separators
	 remains.
	 - a digit starts a number (see scanNumber)
	 - a letter or one of . - ? starts a name/sequence run (see scanNameOrSequence)
	 - any other character is answered as a one-character token.
	 The boolean conditions are parenthesised: without the parentheses the
	 expression is parsed as the single keyword message or:ifTrue:ifFalse:,
	 which Boolean does not understand.
	 No separate CR/LF case is needed because line terminators are separators
	 and have already been skipped."

	| char |
	stream skipSeparators.
	stream atEnd ifTrue: [ ^ nil ].
	char := stream peek.
	char isDigit ifTrue: [ ^ self scanNumber ].
	(char isLetter or: [ '.-?' includes: char ])
		ifTrue: [ ^ self scanNameOrSequence ].
	^ SmaCCToken value: stream next asString
]
54+
55+
{ #category : 'accessing' }
BioPhylipScanner >> stream: aStream [
	"Set the character stream that the scan* methods read from."

	stream := aStream
]
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
"
I read PHYLIP text directly from the stream of my scanner (a BioPhylipScanner).

parse and parseInterleaved answer an Array holding the taxa count, the character count, the collection of names and the collection of sequences. The *Tokenizer methods answer BioPhylipTokenizerWrapper instances whose parse: runs one of my line/block parsing methods.
"
Class {
2+
#name : 'BioPhylipSmaCCParser',
3+
#superclass : 'SmaCCParser',
4+
#category : 'BioParsers-PHYLIP',
5+
#package : 'BioParsers',
6+
#tag : 'PHYLIP'
7+
}
8+
9+
{ #category : 'instance creation' }
BioPhylipSmaCCParser class >> on: aStream [
	"Answer a new parser whose scanner is a BioPhylipScanner reading from aStream."

	| parser |
	parser := self new.
	parser scanner: (BioPhylipScanner on: aStream).
	^ parser
]
13+
14+
{ #category : 'accessing' }
BioPhylipSmaCCParser >> firstLineTokenizer [
	"Answer a tokenizer wrapper for the PHYLIP header line. Its block reads two
	 integers (each preceded by optional separators) from the input and answers
	 an Array with the printString of each, in reading order."

	^ BioPhylipTokenizerWrapper new block: [ :input |
		| source fields |
		source := input readStream.
		fields := (1 to: 2) collect: [ :each |
			source skipSeparators.
			(Integer readFrom: source) printString ].
		fields asArray ]
]
26+
27+
{ #category : 'accessing' }
BioPhylipSmaCCParser >> parse [
	"Read the two header numbers, then every remaining non-blank line of the
	 scanner stream. For each such line (trimmed) the first 10 characters,
	 right-trimmed, are recorded as a name; whatever follows column 10, with
	 spaces removed, is recorded as a sequence when it is not empty.
	 Answer an Array: taxa count, character count, names, sequences."

	| taxaCount siteCount names seqs input |
	taxaCount := self parseNumber.
	siteCount := self parseNumber.
	names := OrderedCollection new.
	seqs := OrderedCollection new.
	input := scanner stream.
	[ input atEnd ] whileFalse: [
		| row |
		row := input nextLine.
		row := row ifNil: [ '' ] ifNotNil: [ :text | text trimBoth ].
		row ifNotEmpty: [
			| residues |
			row size > 10
				ifTrue: [
					names add: (row first: 10) trimRight.
					residues := (row allButFirst: 10) trimBoth copyWithout: Character space ]
				ifFalse: [
					names add: row.
					residues := String empty ].
			residues ifNotEmpty: [ seqs add: residues ] ] ].
	^ { taxaCount. siteCount. names. seqs }
]
59+
60+
{ #category : 'parsing' }
BioPhylipSmaCCParser >> parseInterleaved [
	"Parse interleaved PHYLIP text from the scanner stream.
	 Answer an Array: taxa count, character count, names (each exactly 10
	 characters, space-padded on the right) and sequences (spaces removed), the
	 last two as OrderedCollections in file order.
	 Signals an Error when the header numbers are missing or when the first
	 block has fewer data lines than the announced number of taxa.

	 Fixes over the previous version:
	 - parseNumber stops right after the digits, so the remainder of the header
	   line is consumed here; otherwise it was read as the first taxon line.
	 - blank lines are skipped while looking for data lines, and every data line
	   of a continuation block is appended to its taxon; previously the first
	   line of each continuation block was read and thrown away.
	 - running out of input in the first block is reported explicitly instead of
	   failing on nil."

	| numTaxa numChars taxaNames sequences input padding nextDataLine line index |
	numTaxa := self parseNumber.
	numChars := self parseNumber.
	(numTaxa isNil or: [ numChars isNil ])
		ifTrue: [ ^ self error: 'Invalid PHYLIP header: expected two integers' ].
	input := scanner stream.
	"Drop whatever is left of the header line (at least its line terminator)."
	input atEnd ifFalse: [ input nextLine ].

	taxaNames := OrderedCollection new: numTaxa.
	sequences := OrderedCollection new: numTaxa.
	padding := String new: 10 withAll: Character space.

	"Answers the next non-blank line, trimmed, or nil at end of input."
	nextDataLine := [
		| found |
		found := nil.
		[ found isNil and: [ input atEnd not ] ] whileTrue: [
			| candidate |
			candidate := input nextLine.
			candidate ifNotNil: [
				candidate := candidate trimBoth.
				candidate ifNotEmpty: [ found := candidate ] ] ].
		found ].

	"First block: each line starts with a 10-column name followed by sequence data."
	1 to: numTaxa do: [ :i |
		| name seq |
		line := nextDataLine value.
		line ifNil: [ ^ self error: 'PHYLIP data ends before all taxa were read' ].
		line size <= 10
			ifTrue: [
				name := line , (padding copyFrom: 1 to: 10 - line size).
				seq := '' ]
			ifFalse: [
				name := line copyFrom: 1 to: 10.
				seq := line copyFrom: 11 to: line size ].
		taxaNames add: name.
		sequences add: (seq reject: [ :c | c = Character space ]) ].

	"Continuation blocks: data lines repeat in taxon order, without names."
	numTaxa > 0 ifTrue: [
		index := 0.
		[ (line := nextDataLine value) notNil ] whileTrue: [
			index := index \\ numTaxa + 1.
			sequences
				at: index
				put: (sequences at: index) , (line reject: [ :c | c = Character space ]) ] ].

	^ Array
		with: numTaxa
		with: numChars
		with: taxaNames
		with: sequences
]
101+
102+
{ #category : 'parsing' }
BioPhylipSmaCCParser >> parseNameAndSequence: aLine into: nameBlock and: seqBlock [
	"Split aLine at column 10. Evaluate nameBlock with the right-trimmed name
	 (the whole line when it has 10 characters or fewer), then seqBlock with the
	 rest of the line trimmed and with spaces removed (an empty string when the
	 line has 10 characters or fewer)."

	| hasSequence label residues |
	hasSequence := aLine size > 10.
	label := (hasSequence
		ifTrue: [ aLine first: 10 ]
		ifFalse: [ aLine ]) trimRight.
	residues := hasSequence
		ifTrue: [ (aLine allButFirst: 10) trimBoth copyWithout: Character space ]
		ifFalse: [ '' ].
	nameBlock value: label.
	seqBlock value: residues
]
117+
118+
{ #category : 'parsing' }
BioPhylipSmaCCParser >> parseNumber [
	"Skip separators on the scanner stream, consume the run of digits that
	 follows and answer it converted with asInteger. The stream is left
	 positioned right after the last digit."

	| input digits |
	input := scanner stream.
	input skipSeparators.
	digits := String streamContents: [ :out |
		[ input atEnd or: [ input peek isDigit not ] ]
			whileFalse: [ out nextPut: input next ] ].
	^ digits asInteger
]
128+
129+
{ #category : 'parsing' }
BioPhylipSmaCCParser >> parseNumberFrom: stream [
	"Skip separators on the given stream, consume the run of digits that
	 follows and answer it converted with asInteger."

	| digits |
	stream skipSeparators.
	digits := WriteStream on: String new.
	[ stream atEnd ] whileFalse: [
		stream peek isDigit ifFalse: [ ^ digits contents asInteger ].
		digits nextPut: stream next ].
	^ digits contents asInteger
]
139+
140+
{ #category : 'parsing' }
BioPhylipSmaCCParser >> parseSpeciesBlocksFrom: stream [
	"Read every line of the given stream, ignoring blank ones. For each
	 remaining line (trimmed) answer a three-element Array: the name (first 10
	 characters, space-padded on the right to 10), the text after column 10
	 exactly as found (spaces kept), and nil. Answer the Arrays in an
	 OrderedCollection, in reading order."

	| rows blanks |
	rows := OrderedCollection new.
	blanks := String new: 10 withAll: Character space.
	[ stream atEnd ] whileFalse: [
		| raw |
		raw := stream nextLine.
		raw ifNotNil: [
			| text label residues |
			text := raw trimBoth.
			text isEmpty ifFalse: [
				text size > 10
					ifTrue: [
						label := text first: 10.
						residues := text allButFirst: 10 ]
					ifFalse: [
						label := text , (blanks copyFrom: 1 to: 10 - text size).
						residues := '' ].
				rows add: { label. residues. nil } ] ] ].
	^ rows
]
161+
162+
{ #category : 'parsing' }
BioPhylipSmaCCParser >> parseSpeciesLineFrom: aString [
	"Answer a two-element Array for one species line: the name (first 10
	 characters of aString, space-padded on the right to 10 when shorter) and
	 the sequence (everything after column 10, trimmed and with spaces removed;
	 empty when aString has 10 characters or fewer). aString is used as given,
	 without trimming."

	| label residues |
	aString size > 10
		ifTrue: [
			label := aString first: 10.
			residues := (aString allButFirst: 10) trimBoth copyWithout: Character space ]
		ifFalse: [
			label := aString , (String new: 10 - aString size withAll: Character space).
			residues := '' ].
	^ { label. residues }
]
178+
179+
{ #category : 'accessing-dna' }
BioPhylipSmaCCParser >> speciesDNALineTokenizer [
	"Answer a tokenizer wrapper whose block hands its input to
	 parseSpeciesLineFrom: on the receiver."

	| lineParser |
	lineParser := [ :input | self parseSpeciesLineFrom: input ].
	^ BioPhylipTokenizerWrapper new block: lineParser
]
185+
186+
{ #category : 'accessing-dna' }
BioPhylipSmaCCParser >> speciesDNANamedBlockTokenizer [
	"Answer a tokenizer wrapper whose block opens a read stream on its input
	 and hands it to parseSpeciesBlocksFrom: on the receiver."

	| blockParser |
	blockParser := [ :input | self parseSpeciesBlocksFrom: input readStream ].
	^ BioPhylipTokenizerWrapper new block: blockParser
]
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
"
I wrap a one-argument block. Sending me parse: evaluates the block with the given input and answers the block's result.
"
Class {
2+
#name : 'BioPhylipTokenizerWrapper',
3+
#superclass : 'Object',
4+
#instVars : [
5+
'block'
6+
],
7+
#category : 'BioParsers-PHYLIP',
8+
#package : 'BioParsers',
9+
#tag : 'PHYLIP'
10+
}
11+
12+
{ #category : 'accessing' }
BioPhylipTokenizerWrapper >> block: aBlock [
	"Store the block that parse: will evaluate. Answers the receiver."

	block := aBlock
]
16+
17+
{ #category : 'parsing' }
BioPhylipTokenizerWrapper >> parse: input [
	"Evaluate the stored block with input and answer its result."

	| outcome |
	outcome := block value: input.
	^ outcome
]

0 commit comments

Comments
 (0)