User aggregators #95

Status: Open. Wants to merge 31 commits into base: master.

31 commits
917e118
Merge branch 'master' into boa_evaluator
Jan 31, 2017
a6e1c3d
Initial commit for User Defined Aggregation support
Feb 1, 2017
a31e918
Fixing bug : Only one user defined aggregator runs in the presence of…
Feb 2, 2017
e4b0f05
Removing unnecessary prints
Feb 2, 2017
ba6b2ff
Fixing a bug: Filter non aggregator functions from list
Feb 2, 2017
1829489
Merge branch 'boa_evaluator' into user_Aggregators
Feb 2, 2017
de27178
Fixing a test case as code generation has changed.
Feb 2, 2017
a7fd7b6
Updating latest code generation string template.
Feb 2, 2017
81e8f37
Fixing bug in UserDefinedCode generating process. Fixing fullyqualifi…
Feb 3, 2017
2972425
adding naive bayes example using user defined aggregation
Feb 3, 2017
b161ab2
Allowing creation of arrays of nested and complex types
Feb 12, 2017
81af781
Adding capability to convert a tuple into array if possible. If tuple…
Feb 12, 2017
3295b15
code for matrix transpose, inverse, summation and subtraction suppor…
Feb 13, 2017
a05a385
Adding machine learning examples codes in test directory
Feb 14, 2017
ff5b37b
Adding matrix operations
Feb 14, 2017
6b6aa9f
Fixing bug in getCol method in matrix operations
Feb 15, 2017
0da11ef
linear regression optimized and unoptimized code
Feb 18, 2017
737060d
adding neural network without back propagation
Feb 19, 2017
db0a04f
Changes in MatrixOperations and Adding Print facility for debugging H…
Feb 19, 2017
98eb3ac
removing merge conflicts
Feb 19, 2017
fb23150
adding back propagation in neural
Feb 20, 2017
491adfc
adding pca
Feb 21, 2017
72711ce
adding optimized pca
Feb 22, 2017
edf12ff
adding new machine learning algorithms
Feb 22, 2017
656775d
Adding changes to support options as user defined aggregations
Feb 26, 2017
e691a5b
Changes to support serialization of ml model in Boa
Feb 26, 2017
390fc86
Storing the class as part of model
Feb 26, 2017
6ecf209
Adding serialization support for the model using simple json
Feb 26, 2017
fec8ee8
adding support for loading ml model
Feb 27, 2017
739eb3c
Allowing options in user defined aggregator class
Mar 2, 2017
459000f
adding training model usage
Mar 3, 2017
6 changes: 6 additions & 0 deletions .idea/vcs.xml


172 changes: 172 additions & 0 deletions examples/naive.boa
@@ -0,0 +1,172 @@
p: Project = input;
Review comment (Member):
Move this to 'test/known-good/'? We don't have an examples directory, and if you are going to put code examples in there, we might as well use them as test cases.

type fv = {a:int, b:int, c:int, d:int};
type stats = {a_stat:float, b_stat:float, c_stat:float};
type complete_stat = {avg: stats, dev: stats};
type Data = {training: fv, testing: fv};
splitRatio : float = 0.67;

naive := function(vals : array of Data) : float {
train : array of fv;
test : array of fv;

spearated: map[int] of array of fv; # classified per value
summaries : map[int] of complete_stat;

# separate the training and testing datasets
foreach(i:int; def(vals[i])) {
if(def(train)) {
train = train + {vals[i].training};
} else {
train = {vals[i].training};
}
if(def(test)) {
test = test+ {vals[i].testing};
} else {
test = {vals[i].testing};
}

}


# classify training datasets
foreach(i:int; def(train[i])) {
temp : array of fv = {train[i]};
if(!haskey(spearated, train[i].d)) {
spearated[train[i].d] = temp;
} else {
spearated[train[i].d] = spearated[train[i].d] + temp;
}
}

# all the classes
classes : array of int = keys(spearated);

# summarize data from training dataset
foreach(i:int; def(classes[i])) {
# calculate mean
feature_mean : stats = {0.0, 0.0, 0.0};
foreach(j:int; def(spearated[classes[i]][j])) {
feature_mean.a_stat = feature_mean.a_stat + spearated[classes[i]][j].a;
feature_mean.b_stat = feature_mean.b_stat + spearated[classes[i]][j].b;
feature_mean.c_stat = feature_mean.c_stat + spearated[classes[i]][j].c;
}
feature_mean.a_stat = feature_mean.a_stat / len(spearated[classes[i]]);
feature_mean.b_stat = feature_mean.b_stat / len(spearated[classes[i]]);
feature_mean.c_stat = feature_mean.c_stat / len(spearated[classes[i]]);


# calculate sd
feature_sd : stats = {0.0, 0.0, 0.0};
foreach(j:int; def(spearated[classes[i]][j])) {
feature_sd.a_stat = feature_sd.a_stat + pow(spearated[classes[i]][j].a - feature_mean.a_stat, 2);
feature_sd.b_stat = feature_sd.b_stat + pow(spearated[classes[i]][j].b - feature_mean.b_stat, 2);
feature_sd.c_stat = feature_sd.c_stat + pow(spearated[classes[i]][j].c - feature_mean.c_stat, 2);
}
feature_sd.a_stat = sqrt(feature_sd.a_stat / len(spearated[classes[i]]));
feature_sd.b_stat = sqrt(feature_sd.b_stat / len(spearated[classes[i]]));
feature_sd.c_stat = sqrt(feature_sd.c_stat / len(spearated[classes[i]]));

# summarized a class
summaries[classes[i]] = {feature_mean, feature_sd};
}


predictions: array of int;
predictions = new(predictions, len(test), -1);

# predict for each test data
foreach(i:int; def(test[i])) {
probabilities : map[int] of float;
foreach(j: int; def(classes[j])) {
probabilities[classes[j]] = 1.0;
mean := summaries[classes[j]].avg;
deviation := summaries[classes[j]].dev;
# Gaussian likelihood per feature; each term must use that feature's own deviation
probabilities[classes[j]] = probabilities[classes[j]] * (1 / (sqrt(2 * 3.14) * deviation.a_stat)) * (exp(-1 * ((pow((1.0 * test[i].a) - mean.a_stat, 2)) / (2 * pow(deviation.a_stat, 2)))));
probabilities[classes[j]] = probabilities[classes[j]] * (1 / (sqrt(2 * 3.14) * deviation.b_stat)) * (exp(-1 * ((pow((1.0 * test[i].b) - mean.b_stat, 2)) / (2 * pow(deviation.b_stat, 2)))));
probabilities[classes[j]] = probabilities[classes[j]] * (1 / (sqrt(2 * 3.14) * deviation.c_stat)) * (exp(-1 * ((pow((1.0 * test[i].c) - mean.c_stat, 2)) / (2 * pow(deviation.c_stat, 2)))));
}

bestProb : float = 0.0;
bestLab : int = -1;
foreach(j: int; def(classes[j])) {
if ((bestLab == -1) || (bestProb < probabilities[classes[j]])) {
bestProb = probabilities[classes[j]];
bestLab = classes[j];
}
}
predictions[i] = bestLab;
}

correct : float = 0.0;
foreach(i:int; def(test[i])) {
if(predictions[i] == test[i].d) {
correct = correct + 1.0;
}
}
return correct/len(test) * 100;
};

scale := function(ast: int, method: int, class: int) : int {
total : int = 0;
if(ast > 1000) {
total++;
}
if(method > 500) {
total++;
}
if(class > 50) {
total++;
}
return total;
};


naive_bayes : output naive of Data;

# count ast nodes

astCount := 0;
classCount := 0;
methodCount := 0;
visit(p, visitor {
# only look at the latest snapshot
before n: CodeRepository -> {
snapshot := getsnapshot(n);
foreach (i: int; def(snapshot[i]))
visit(snapshot[i]);
stop;
}
before node: Declaration -> {
if (node.kind == TypeKind.CLASS) {
classCount++;
foreach (i: int; def(node.methods[i])) {
methodCount++;
}
}
}
# by default, count all visited nodes
before _ -> astCount++;
# these nodes are not part of the AST, so do nothing when visiting
before Project, ChangedFile -> ;
});



dummy : fv = {0, 0, 0, 0};
nondummy : fv = {astCount, methodCount, classCount, scale(astCount, methodCount, classCount)};
data1: Data = {nondummy, dummy};
data2: Data = {dummy, nondummy};
if(rand() > splitRatio)
naive_bayes << data1;
else
naive_bayes << data2;


if(rand() > splitRatio)
naive_bayes << data1;
else
naive_bayes << data2;


if(rand() > splitRatio)
naive_bayes << data1;
else
naive_bayes << data2;
Binary file added lib/gson-2.8.0.jar
Binary file added lib/guava-21.0.jar
Binary file added lib/jama-1.0.3.jar
16 changes: 15 additions & 1 deletion src/antlr/Boa.g
@@ -179,7 +179,7 @@ outputType returns [OutputType ast]
locals [int l, int c]
@init { $l = getStartLine(); $c = getStartColumn(); }
@after { $ast.setPositions($l, $c, getEndLine(), getEndColumn()); }
: OUTPUT (tk=SET { $ast = new OutputType(new Identifier($tk.text)); } | id=identifier { $ast = new OutputType($id.ast); }) (LPAREN el=expressionList RPAREN { $ast.setArgs($el.list); })? (LBRACKET m=component RBRACKET { $ast.addIndice($m.ast); })* OF m=component { $ast.setType($m.ast); } (WEIGHT m=component { $ast.setWeight($m.ast); })? (FORMAT LPAREN el=expressionList RPAREN)?
: OUTPUT (tk=SET { $ast = new OutputType(new Identifier($tk.text)); } | id=identifier { $ast = new OutputType($id.ast); }) (LPAREN vl=vardeclList RPAREN { $ast.setParams($vl.list); })? (LPAREN el=expressionList RPAREN { $ast.setArgs($el.list); })? (LBRACKET m=component RBRACKET { $ast.addIndice($m.ast); })* OF m=component { $ast.setType($m.ast); } (WEIGHT m=component { $ast.setWeight($m.ast); })? (FORMAT LPAREN el=expressionList RPAREN)?
;

functionType returns [FunctionType ast]
@@ -429,6 +429,20 @@ expressionList returns [ArrayList<Expression> list]
| e=expression { $list.add($e.ast); } ({ notifyErrorListeners("error: ',' expected"); } e=expression { $list.add($e.ast); } | COMMA e=expression { $list.add($e.ast); })*
;


useraggParamDeclaration returns [VarDeclStatement ast]
locals [int l, int c]
@init { $l = getStartLine(); $c = getStartColumn(); }
@after { $ast.setPositions($l, $c, getEndLine(), getEndColumn()); }
: v=forVariableDeclaration { $ast = $v.ast; }
;

vardeclList returns [ArrayList<VarDeclStatement> list]
@init { $list = new ArrayList<VarDeclStatement>(); }
: e=useraggParamDeclaration { $list.add($e.ast); } (COMMA e=useraggParamDeclaration { $list.add($e.ast); })*
| e=useraggParamDeclaration { $list.add($e.ast); } ({ notifyErrorListeners("error: ',' expected"); } e=useraggParamDeclaration { $list.add($e.ast); } | COMMA e=useraggParamDeclaration { $list.add($e.ast); })*
;

conjunction returns [Conjunction ast]
locals [int l, int c]
@init { $l = getStartLine(); $c = getStartColumn(); }
5 changes: 5 additions & 0 deletions src/java/boa/BoaEnumInterface.java
@@ -0,0 +1,5 @@
package boa;

public interface BoaEnumInterface {
Object getValue();
}
14 changes: 14 additions & 0 deletions src/java/boa/BoaTup.java
@@ -0,0 +1,14 @@
package boa;

import java.io.IOException;
import java.util.Collection;


public interface BoaTup {
public String[] getValues();
public byte[] serialize(Object o) throws IOException;
public Object getValue(String f);
public String toString();
public <T> T[] asArray(T[] type);
public String[] getFieldNames();
}
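To make the new interface concrete, here is a self-contained sketch of what a compiler-generated tuple class satisfying `BoaTup` might look like. The class name `FvTup`, its field layout (modeled on `type fv = {a:int, b:int, c:int, d:int}` from `examples/naive.boa`), and the use of plain Java serialization are illustrative assumptions, not what the PR's code generator actually emits; the interface is abridged to compile standalone, without the `boa` package.

```java
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.Arrays;

// Abridged copy of the BoaTup interface introduced in this PR
// (package declaration dropped so the sketch compiles standalone).
interface BoaTup {
    String[] getValues();
    byte[] serialize(Object o) throws IOException;
    Object getValue(String f);
    <T> T[] asArray(T[] type);
    String[] getFieldNames();
}

// Hypothetical generated class for `type fv = {a:int, b:int, c:int, d:int}`.
class FvTup implements BoaTup, Serializable {
    final long a, b, c, d;

    FvTup(long a, long b, long c, long d) {
        this.a = a; this.b = b; this.c = c; this.d = d;
    }

    @Override public String[] getFieldNames() {
        return new String[] { "a", "b", "c", "d" };
    }

    @Override public String[] getValues() {
        return new String[] {
            Long.toString(a), Long.toString(b), Long.toString(c), Long.toString(d)
        };
    }

    @Override public Object getValue(String f) {
        switch (f) {
            case "a": return a;
            case "b": return b;
            case "c": return c;
            case "d": return d;
            default:  return null;
        }
    }

    @Override public byte[] serialize(Object o) throws IOException {
        // Plain Java serialization as a stand-in for whatever scheme the PR uses.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
            out.writeObject(o);
        }
        return bytes.toByteArray();
    }

    @Override public <T> T[] asArray(T[] type) {
        // Mirrors the "convert a tuple into an array if possible" commit:
        // all fields share one type, so they can be copied into an array.
        Long[] fields = { a, b, c, d };
        return Arrays.asList(fields).toArray(type);
    }

    @Override public String toString() {
        return Arrays.toString(getValues());
    }
}
```

Note that `asArray` can only succeed when every field has the same type, which matches the commit message's "if possible" caveat.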
55 changes: 55 additions & 0 deletions src/java/boa/aggregators/Aggregator.java
@@ -18,10 +18,15 @@

import java.io.IOException;

import boa.BoaTup;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer.Context;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import boa.functions.BoaCasts;
import boa.io.EmitKey;
import boa.io.EmitValue;
@@ -38,6 +43,29 @@ public abstract class Aggregator {
private Context context;
private EmitKey key;
private boolean combining;
private final static Set<String> inBuiltAggs = new HashSet<String>();

static {
inBuiltAggs.add("sum");
inBuiltAggs.add("top");
inBuiltAggs.add("maximum");
inBuiltAggs.add("minimum");
inBuiltAggs.add("max");
inBuiltAggs.add("min");
inBuiltAggs.add("collection");
inBuiltAggs.add("mean");
inBuiltAggs.add("median");
inBuiltAggs.add("stdev");
inBuiltAggs.add("quantile");
inBuiltAggs.add("kurtosis");
inBuiltAggs.add("histogram");
inBuiltAggs.add("graphCSV");
inBuiltAggs.add("set");
inBuiltAggs.add("bottom");
inBuiltAggs.add("skewness");
inBuiltAggs.add("confidence");
inBuiltAggs.add("variance");
}

/**
* Construct an Aggregator.
@@ -93,6 +121,13 @@ public void aggregate(final double data) throws IOException, InterruptedExceptio
this.aggregate(BoaCasts.doubleToString(data), null);
}

public void aggregate(final BoaTup data, final String metadata) throws IOException, InterruptedException, FinishedException {
}

public void aggregate(final BoaTup data) throws IOException, InterruptedException, FinishedException {
this.aggregate(data, null);
}

@SuppressWarnings("unchecked")
protected void collect(final String data, final String metadata) throws IOException, InterruptedException {
if (this.combining)
@@ -107,6 +142,22 @@ protected void collect(final String data) throws IOException, InterruptedExcepti
this.collect(data, null);
}

protected void collect(final BoaTup data) throws IOException, InterruptedException {
this.collect(data.toString(), null);
}

protected void collect(final BoaTup[] data) throws IOException, InterruptedException {
this.collect(Arrays.toString(data), null);
}

protected void collect(final double[] data) throws IOException, InterruptedException {
this.collect(Arrays.toString(data), null);
}

protected void collect(final long[] data) throws IOException, InterruptedException {
this.collect(Arrays.toString(data), null);
}

@SuppressWarnings("unchecked")
protected void collect(final long data, final String metadata) throws IOException, InterruptedException {
this.collect(BoaCasts.longToString(data), metadata);
@@ -157,4 +208,8 @@ public void setKey(final EmitKey key) {
public EmitKey getKey() {
return this.key;
}

public final static boolean isUserDefinedAggregator(String name) {
return !Aggregator.inBuiltAggs.contains(name);
}
}
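The new `isUserDefinedAggregator` check is a plain set-membership test: any output-type name not in the fixed list of built-ins is treated as user defined. A standalone sketch of the same logic (names copied from the diff above, class name `AggregatorNames` is hypothetical):

```java
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

// Standalone sketch of the built-in-aggregator check added in this PR.
class AggregatorNames {
    private static final Set<String> inBuiltAggs = new HashSet<>(Arrays.asList(
            "sum", "top", "maximum", "minimum", "max", "min", "collection",
            "mean", "median", "stdev", "quantile", "kurtosis", "histogram",
            "graphCSV", "set", "bottom", "skewness", "confidence", "variance"));

    static boolean isUserDefinedAggregator(String name) {
        // Anything outside the fixed set is assumed to be user defined.
        return !inBuiltAggs.contains(name);
    }
}
```

One consequence of this name-based design: a user function that happens to share a name with a built-in aggregator would be classified as built in, so user aggregator names must avoid the reserved set.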
25 changes: 25 additions & 0 deletions src/java/boa/aggregators/UserDefinedAggregator.java
@@ -0,0 +1,25 @@
package boa.aggregators;


import boa.compiler.UserDefinedAggregators;
import boa.datagen.util.FileIO;
import com.google.gson.Gson;

import java.io.*;

@AggregatorSpec(name = "UserDefinedAgg", formalParameters = { "any", "any" }, type = "UserDefined", canCombine = false)
public abstract class UserDefinedAggregator extends Aggregator {

public void store(Object object) {
final File output = new File(UserDefinedAggregators.getFileName());
final String dest = output.getAbsolutePath() + "/";
output.mkdir();
writeAsJSON(object, dest + UserDefinedAggregators.getFileName() + ".model");
}

private void writeAsJSON(Object object, String path) {
Gson writer = new Gson();
FileIO.writeFileContents(new File(path), writer.toJson(object));
}
}
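The persistence pattern here is: create a directory named after the job and write the serialized model into a `.model` file inside it. A minimal stdlib-only sketch of the same pattern (Gson replaced by a caller-supplied string so the sketch has no dependencies; the method and parameter names are assumptions for illustration):

```java
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;

// Sketch of the model-persistence pattern used by UserDefinedAggregator.store():
// create a per-job directory and drop a "<job>.model" file into it.
class ModelStore {
    static File store(String jobName, String serializedModel, File baseDir) throws IOException {
        File output = new File(baseDir, jobName);
        output.mkdirs(); // like output.mkdir() in the PR, but creates parents too
        File model = new File(output, jobName + ".model");
        Files.write(model.toPath(), serializedModel.getBytes(StandardCharsets.UTF_8));
        return model;
    }
}
```

In the PR itself the serialized string comes from `Gson.toJson(object)`; the sketch takes it as an argument only to stay dependency-free.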
3 changes: 3 additions & 0 deletions src/java/boa/compiler/BoaCompiler.java
@@ -155,6 +155,8 @@ public void syntaxError(Recognizer<?, ?> recognizer, Object offendingSymbol, int

try {
if (!parserErrorListener.hasError) {
UserDefinedAggregators.setFileName(f.getName());
UserDefinedAggregators.setJobName("Job" + jobName);
new TypeCheckingVisitor().start(p, new SymbolTable());

final TaskClassifyingVisitor simpleVisitor = new TaskClassifyingVisitor();
@@ -241,6 +243,7 @@ public void syntaxError(Recognizer<?, ?> recognizer, Object offendingSymbol, int
st.add("jobs", jobs);
st.add("jobnames", jobnames);
st.add("combineTables", CodeGeneratingVisitor.combineAggregatorStrings);
st.add("userDeclAgg", CodeGeneratingVisitor.userAggregatorDeclStrings);
st.add("reduceTables", CodeGeneratingVisitor.reduceAggregatorStrings);
st.add("splitsize", isSimple ? 64 * 1024 * 1024 : 10 * 1024 * 1024);
if(DefaultProperties.localDataPath != null) {