1
+ #!/usr/bin/swift
2
+
1
3
import Foundation
2
4
3
5
guard #available( OSX 10 . 14 , * ) else {
@@ -7,54 +9,87 @@ guard #available(OSX 10.14, *) else {
7
9
import CreateML
8
10
import NaturalLanguage
9
11
10
- //let destinationPath = <#Path to Destination.mlmodel#>
12
/// A programming language that a source file in the corpus can be labelled with.
///
/// The raw value is the human-readable language name; it is what gets stored
/// as the label for each training sample.
enum ProgrammingLanguage: String {
    case c = "C"
    case cPlusPlus = "C++"
    case go = "Go"
    case java = "Java"
    case javaScript = "JavaScript"
    case objectiveC = "Objective-C"
    case php = "PHP"
    case ruby = "Ruby"
    case rust = "Rust"
    case swift = "Swift"

    /// Infers the language of a file from the corpus directory it was found in
    /// and its path extension.
    ///
    /// The directory name is only consulted for `.h` headers, which are
    /// ambiguous between C, C++ and Objective-C; every other extension maps
    /// to a language regardless of the directory.
    ///
    /// - Parameters:
    ///   - directory: Name of the corpus directory containing the file
    ///     (for example `"c"`, `"cc"` or `"objective-c"`).
    ///   - fileExtension: The file's path extension without the leading dot,
    ///     or `nil` when there is none.
    /// - Returns: `nil` when the combination is not recognized.
    init?(directory: String, fileExtension: String?) {
        switch (directory, fileExtension) {
        case ("c", "h"), (_, "c"):
            self = .c
        case ("cc", "h"), (_, "cc"), (_, "cpp"):
            self = .cPlusPlus
        case (_, "go"):
            self = .go
        case (_, "java"):
            self = .java
        case (_, "js"):
            self = .javaScript
        case ("objective-c", "h"), (_, "m"):
            self = .objectiveC
        case (_, "php"):
            self = .php
        case (_, "rb"):
            self = .ruby
        case (_, "rs"):
            self = .rust
        case (_, "swift"):
            self = .swift
        default:
            // Unknown extension, or a header outside a known C-family directory.
            return nil
        }
    }
}
11
41
12
- //let corpusPath = "<#Path to Corpus Directory#>"
42
// Location the trained Core ML model is written to.
// NOTE: machine-specific absolute path; adjust before running elsewhere.
let destinationPath = "/Users/mattt/Desktop/Classifier.mlmodel"

// Root of the training corpus; its immediate subdirectories are enumerated
// for source files.
// NOTE: machine-specific absolute path; adjust before running elsewhere.
let corpusPath = "/Users/mattt/Downloads/code-corpora"
let corpusURL = URL(fileURLWithPath: corpusPath)

// Shared file manager used to walk the corpus directory tree.
let fileManager = FileManager.default
16
48
17
- try fileManager. contentsOfDirectory ( at: corpusURL, includingPropertiesForKeys: [ . isDirectoryKey] , options: . skipsHiddenFiles)
18
-
19
49
do {
20
- var corpus = try MLDataTable ( dictionary : [ " text " : [ " " ] , " label " : [ " " ] ] )
21
-
50
+ var corpus : [ ( text: String , label: String ) ] = [ ]
51
+
22
52
for directory in try fileManager. contentsOfDirectory ( at: corpusURL, includingPropertiesForKeys: [ . isDirectoryKey] , options: [ . skipsHiddenFiles] ) {
23
53
guard directory. hasDirectoryPath,
24
54
let enumerator = fileManager. enumerator ( at: directory, includingPropertiesForKeys: [ . isDirectoryKey] )
25
55
else {
26
56
continue
27
57
}
28
-
58
+
29
59
for case let resource as URL in enumerator {
30
60
guard !resource. hasDirectoryPath,
31
61
let language = ProgrammingLanguage ( directory: directory. lastPathComponent, fileExtension: resource. pathExtension) ,
32
62
let text = try ? String ( contentsOf: resource)
33
63
else {
34
64
continue
35
65
}
36
-
37
- let dataTable = try MLDataTable ( dictionary: [ " text " : text, " label " : language. description] )
38
- corpus. append ( contentsOf: dataTable)
66
+ corpus. append ( ( text: text, label: language. rawValue) )
39
67
}
40
68
}
41
-
42
- let ( trainingData, testingData) = corpus. randomSplit ( by: 0.9 , seed: 0 )
43
-
69
+
70
+ let ( texts, labels) = corpus. reduce ( into: ( [ String] ( ) , [ String] ( ) ) ) {
71
+ $0. 0 . append ( $1. text)
72
+ $0. 1 . append ( $1. label)
73
+ }
74
+
75
+ let dataTable = try MLDataTable ( dictionary: [ " text " : texts, " label " : labels] )
76
+
77
+ let ( trainingData, testingData) = dataTable. randomSplit ( by: 0.9 , seed: 0 )
78
+
44
79
// As of Xcode 10.0 beta (10L176w),
45
80
// attempted use of CRF algorithm results in EXC_BAD_ACCESS.
46
81
/*
47
82
let parameters = MLTextClassifier.ModelParameters(validationData: validationData, algorithm: .crf(revision: 1), language: .english)
48
83
let classifier = try MLTextClassifier(trainingData: trainingData, textColumn: "text", labelColumn: "label", parameters: parameters)
49
84
*/
50
-
85
+
51
86
let classifier = try MLTextClassifier ( trainingData: trainingData, textColumn: " text " , labelColumn: " label " )
52
-
87
+
53
88
classifier. modelParameters. algorithm
54
-
89
+
55
90
let evaluation = classifier. evaluation ( on: testingData)
56
91
print ( evaluation)
57
-
92
+
58
93
let modelPath = URL ( fileURLWithPath: destinationPath)
59
94
try classifier. write ( to: modelPath)
60
95
} catch {
0 commit comments