Skip to content

Commit 5897a67

Browse files
authored
[Refactor:Plagiarism] Rebranding Lichen (#64)
* renaming sequence length to hash size and prior term to other gradeables
* suggested edits
* linting
1 parent 95b77ef commit 5897a67

File tree

10 files changed

+52
-52
lines changed

10 files changed

+52
-52
lines changed

bin/concatenate_all.py

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,8 @@ def validate(config, args):
7777
regex_dirs = config["regex_dirs"]
7878
language = config["language"]
7979
threshold = int(config["threshold"])
80-
sequence_length = int(config["sequence_length"])
81-
prior_term_gradeables = config["prior_term_gradeables"]
80+
hash_size = int(config["hash_size"])
81+
other_gradeables = config["other_gradeables"]
8282

8383
# Check we have a tokenizer to support the configured language
8484
langs_data_json_path = "./data.json" # data.json is in the Lichen/bin directory after install
@@ -87,30 +87,30 @@ def validate(config, args):
8787
if language not in langs_data:
8888
raise SystemExit(f"ERROR! tokenizing not supported for language {language}")
8989

90-
# Check values of common code threshold and sequence length
90+
# Check values of common code threshold and hash size
9191
if (threshold < 2):
9292
raise SystemExit("ERROR! threshold must be >= 2")
9393

94-
if (sequence_length < 1):
95-
raise SystemExit("ERROR! sequence_length must be >= 1")
94+
if (hash_size < 1):
95+
raise SystemExit("ERROR! hash_size must be >= 1")
9696

9797
# Check for backwards crawling
9898
for e in regex_patterns:
9999
if ".." in e:
100100
raise SystemExit('ERROR! Invalid path component ".." in regex')
101101

102-
for ptg in prior_term_gradeables:
103-
for field in ptg:
102+
for gradeable in other_gradeables:
103+
for field in gradeable:
104104
if ".." in field:
105-
raise SystemExit('ERROR! Invalid component ".." in prior_term_gradeable path')
105+
raise SystemExit('ERROR! Invalid component ".." in other_gradeable path')
106106

107-
# check permissions to make sure we have access to the prior term gradeables
107+
# check permissions to make sure we have access to the other gradeables
108108
my_course_group_perms = Path(args.basepath).group()
109-
for ptg in prior_term_gradeables:
110-
if Path(args.datapath, ptg["prior_semester"], ptg["prior_course"]).group()\
109+
for gradeable in other_gradeables:
110+
if Path(args.datapath, gradeable["other_semester"], gradeable["other_course"]).group()\
111111
!= my_course_group_perms:
112-
raise SystemExit(f"ERROR! Invalid permissions to access course {ptg['prior_semester']}"
113-
f"/{ptg['prior_course']}")
112+
raise SystemExit("ERROR! Invalid permissions to access course "
113+
f"{gradeable['other_semester']}/{gradeable['other_course']}")
114114

115115
# make sure the regex directory is one of the acceptable directories
116116
for dir in regex_dirs:
@@ -141,7 +141,7 @@ def main():
141141
version_mode = config["version"]
142142
regex_patterns = config["regex"]
143143
regex_dirs = config["regex_dirs"]
144-
prior_term_gradeables = config["prior_term_gradeables"]
144+
other_gradeables = config["other_gradeables"]
145145
users_to_ignore = config["ignore_submissions"]
146146

147147
# ==========================================================================
@@ -196,14 +196,14 @@ def main():
196196
checkTotalSize(total_concat)
197197

198198
# ==========================================================================
199-
# loop over all of the other prior term gradeables and concatenate their submissions
200-
for other_gradeable in prior_term_gradeables:
199+
# loop over all of the other gradeables and concatenate their submissions
200+
for other_gradeable in other_gradeables:
201201
for dir in regex_dirs:
202202
other_gradeable_path = os.path.join(args.datapath,
203-
other_gradeable["prior_semester"],
204-
other_gradeable["prior_course"],
203+
other_gradeable["other_semester"],
204+
other_gradeable["other_course"],
205205
dir,
206-
other_gradeable["prior_gradeable"])
206+
other_gradeable["other_gradeable"])
207207
# loop over each user
208208
for other_user in sorted(os.listdir(other_gradeable_path)):
209209
other_user_path = os.path.join(other_gradeable_path, other_user)
@@ -233,7 +233,7 @@ def main():
233233
continue
234234

235235
other_output_file_path = os.path.join(args.basepath, "other_gradeables",
236-
f"{other_gradeable['prior_semester']}__{other_gradeable['prior_course']}__{other_gradeable['prior_gradeable']}", # noqa: E501
236+
f"{other_gradeable['other_semester']}__{other_gradeable['other_course']}__{other_gradeable['other_gradeable']}", # noqa: E501
237237
other_user, other_version,
238238
"submission.concatenated")
239239

@@ -264,8 +264,8 @@ def main():
264264
f"for user {user} version {version}")
265265

266266
# do the same for the other gradeables
267-
for other_gradeable in prior_term_gradeables:
268-
other_gradeable_dir_name = f"{other_gradeable['prior_semester']}__{other_gradeable['prior_course']}__{other_gradeable['prior_gradeable']}" # noqa: E501
267+
for other_gradeable in other_gradeables:
268+
other_gradeable_dir_name = f"{other_gradeable['other_semester']}__{other_gradeable['other_course']}__{other_gradeable['other_gradeable']}" # noqa: E501
269269
for other_user in os.listdir(os.path.join(args.basepath, "other_gradeables",
270270
other_gradeable_dir_name)):
271271
other_user_path = os.path.join(args.basepath, "other_gradeables",

bin/hash_all.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def parse_args():
2121

2222
def hasher(lichen_config, lichen_run_config, my_tokenized_file, my_hashes_file):
2323
language = lichen_run_config["language"]
24-
sequence_length = int(lichen_run_config["sequence_length"])
24+
hash_size = int(lichen_run_config["hash_size"])
2525

2626
data_json_path = "./data.json" # data.json is in the Lichen/bin directory after install
2727
with open(data_json_path) as token_data_file:
@@ -37,8 +37,8 @@ def hasher(lichen_config, lichen_run_config, my_tokenized_file, my_hashes_file):
3737
num = len(tokens)
3838
# FIXME: this truncation should be adjusted after testing
3939
token_hashed_values = [(hashlib.md5(''.join(
40-
token_values[x:x+sequence_length]).encode())
41-
.hexdigest())[0:8] for x in range(0, num-sequence_length+1)]
40+
token_values[x:x+hash_size]).encode())
41+
.hexdigest())[0:8] for x in range(0, num-hash_size+1)]
4242

4343
if len(token_hashed_values) > lichen_config["max_sequences_per_file"]:
4444
token_hashed_values = token_hashed_values[slice(0, lichen_config["max_sequences_per_file"])] # noqa E501

bin/process_all.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ KILL_ERROR_MESSAGE="
1818
* An error occured while running Lichen. Your run was probably killed for *
1919
* exceeding the configured resource limits. Before rerunning, perhaps try any *
2020
* of the following edits to the configuration: *
21-
* - Increasing the sequence length *
21+
* - Increasing the hash size *
2222
* - Using only active version *
2323
* - Decreasing the common code threshold *
2424
* - Selecting fewer files to be compared *

bin/tokenize_all.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def main():
7171
tokenize(lichen_config_data, my_concatenated_file, my_tokenized_file)
7272

7373
# ===========================================================================
74-
# tokenize the other prior term gradeables' submissions
74+
# tokenize the other gradeables' submissions
7575
other_gradeables_dir = os.path.join(args.basepath, "other_gradeables")
7676
if not os.path.isdir(other_gradeables_dir):
7777
raise SystemExit("ERROR! Unable to find other gradeables directory")

compare_hashes/compare_hashes.cpp

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ int main(int argc, char* argv[]) {
125125
config.semester = config_file_json.value("semester", "ERROR");
126126
config.course = config_file_json.value("course", "ERROR");
127127
config.gradeable = config_file_json.value("gradeable", "ERROR");
128-
config.sequence_length = config_file_json.value("sequence_length", 1);
128+
config.hash_size = config_file_json.value("hash_size", 1);
129129
config.threshold = config_file_json.value("threshold", 5);
130130

131131
// error checking, confirm there are hashes to work with
@@ -140,8 +140,8 @@ int main(int argc, char* argv[]) {
140140
boost::filesystem::path provided_code_file = lichen_gradeable_path / "provided_code" / "hashes.txt";
141141
// if file exists in that location, the provided code mode is enabled.
142142
config.provided_code_enabled = boost::filesystem::exists(provided_code_file);
143-
// path to prior gradeables' data
144-
boost::filesystem::path prior_terms_dir = lichen_gradeable_path / "other_gradeables";
143+
// path to other gradeables' data
144+
boost::filesystem::path other_gradeables_dir = lichen_gradeable_path / "other_gradeables";
145145

146146

147147
// ===========================================================================
@@ -153,7 +153,7 @@ int main(int argc, char* argv[]) {
153153
std::vector<Submission*> all_submissions;
154154
// Stores all hashes from the instructor provided code
155155
std::unordered_set<hash> provided_code;
156-
// stores all hashes from other prior term gradeables
156+
// stores all hashes from other gradeables
157157
std::unordered_map<hash, std::unordered_map<user_id, std::vector<HashLocation>>> other_gradeables;
158158
// stores the highest match for every student, used later for generating overall_rankings.txt
159159
std::unordered_map<std::string, std::pair<int, float>> highest_matches;
@@ -171,10 +171,10 @@ int main(int argc, char* argv[]) {
171171
}
172172
}
173173

174-
// load prior gradeables' hashes
174+
// load other gradeables' hashes
175175
// iterate over all other gradeables
176176
boost::filesystem::directory_iterator end_iter;
177-
for (boost::filesystem::directory_iterator other_gradeable_itr(prior_terms_dir); other_gradeable_itr != end_iter; ++other_gradeable_itr) {
177+
for (boost::filesystem::directory_iterator other_gradeable_itr(other_gradeables_dir); other_gradeable_itr != end_iter; ++other_gradeable_itr) {
178178
boost::filesystem::path other_gradeable_path = other_gradeable_itr->path();
179179
assert (is_directory(other_gradeable_path));
180180
std::string other_gradeable_str = other_gradeable_itr->path().filename().string();
@@ -193,7 +193,7 @@ int main(int argc, char* argv[]) {
193193
version_number other_version = std::stoi(str_other_version);
194194
assert (other_version > 0);
195195

196-
// load the hashes from this prior submission
196+
// load the hashes from this submission from another gradeable
197197
boost::filesystem::path other_hash_file = other_version_path / "hashes.txt";
198198
std::ifstream istr(other_hash_file.string());
199199
assert(istr.good());
@@ -371,7 +371,7 @@ int main(int argc, char* argv[]) {
371371
std::vector<nlohmann::json> matchingpositions;
372372
nlohmann::json position;
373373
position["start"] = matching_positions_itr->location;
374-
position["end"] = matching_positions_itr->location + config.sequence_length - 1;
374+
position["end"] = matching_positions_itr->location + config.hash_size - 1;
375375
matchingpositions.push_back(position);
376376

377377
// search for all matching positions of the suspicious match in other submissions
@@ -393,7 +393,7 @@ int main(int argc, char* argv[]) {
393393

394394
if (matchingpositions.size() >= lichen_config["max_matching_positions"]) {
395395
std::cout << "Matching positions array truncated for user: [" << other["username"] << "] version: " << other["version"] << std::endl;
396-
std::cout << " - Try increasing the sequence length to fix this problem." << std::endl;
396+
std::cout << " - Try increasing the hash size to fix this problem." << std::endl;
397397
break;
398398
}
399399

@@ -403,7 +403,7 @@ int main(int argc, char* argv[]) {
403403
other["source_gradeable"] = matching_positions_itr->source_gradeable;
404404
}
405405
position["start"] = matching_positions_itr->location;
406-
position["end"] = matching_positions_itr->location + config.sequence_length - 1;
406+
position["end"] = matching_positions_itr->location + config.hash_size - 1;
407407
matchingpositions.push_back(position);
408408
}
409409
}
@@ -414,7 +414,7 @@ int main(int argc, char* argv[]) {
414414

415415
nlohmann::json info;
416416
info["start"] = location_itr->first;
417-
info["end"] = location_itr->first + config.sequence_length - 1;
417+
info["end"] = location_itr->first + config.hash_size - 1;
418418
info["type"] = "match";
419419
info["others"] = others;
420420

@@ -431,7 +431,7 @@ int main(int argc, char* argv[]) {
431431

432432
nlohmann::json info;
433433
info["start"] = *location_itr;
434-
info["end"] = *location_itr + config.sequence_length - 1;
434+
info["end"] = *location_itr + config.hash_size - 1;
435435
info["type"] = "common";
436436

437437
result.push_back(info);
@@ -447,7 +447,7 @@ int main(int argc, char* argv[]) {
447447

448448
nlohmann::json info;
449449
info["start"] = *location_itr;
450-
info["end"] = *location_itr + config.sequence_length - 1;
450+
info["end"] = *location_itr + config.hash_size - 1;
451451
info["type"] = "provided";
452452

453453
result.push_back(info);

compare_hashes/lichen_config.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ struct LichenConfig {
55
std::string semester;
66
std::string course;
77
std::string gradeable;
8-
int sequence_length;
8+
int hash_size;
99
int threshold;
1010
bool provided_code_enabled;
1111
};

compare_hashes/submission.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ float Submission::getPercentage() const {
1616

1717
void Submission::addSuspiciousMatch(location_in_submission location, const HashLocation &matching_location, const hash &matched_hash) {
1818
// figure out if there is an overlap between this hash and a common/provided match
19-
int sequence_length = config_.sequence_length;
20-
for (int i = location - 1; i > location - sequence_length && i >= 0; i--) {
19+
int hash_size = config_.hash_size;
20+
for (int i = location - 1; i > location - hash_size && i >= 0; i--) {
2121
if (common_matches.find(i) != common_matches.end() || provided_matches.find(i) != provided_matches.end()) {
2222
return;
2323
}
@@ -31,8 +31,8 @@ void Submission::addSuspiciousMatch(location_in_submission location, const HashL
3131

3232
void Submission::addCommonMatch(location_in_submission location) {
3333
// figure out if there is an overlap between this hash and a match
34-
int sequence_length = config_.sequence_length;
35-
for (int i = location - 1; i > location - sequence_length && i >= 0; i--) {
34+
int hash_size = config_.hash_size;
35+
for (int i = location - 1; i > location - hash_size && i >= 0; i--) {
3636
std::map<location_in_submission, std::set<HashLocation> >::const_iterator find_i = suspicious_matches.find(i);
3737
// if there is an overlap, remove the suspicious match that overlaps
3838
// hopefully this doesn't cause problems with other submissions thinking
@@ -47,8 +47,8 @@ void Submission::addCommonMatch(location_in_submission location) {
4747

4848
void Submission::addProvidedMatch(location_in_submission location) {
4949
// figure out if there is an overlap between this hash and a match
50-
int sequence_length = config_.sequence_length;
51-
for (int i = location - 1; i > location - sequence_length && i >= 0; i--) {
50+
int hash_size = config_.hash_size;
51+
for (int i = location - 1; i > location - hash_size && i >= 0; i--) {
5252
std::map<location_in_submission, std::set<HashLocation> >::const_iterator find_i = suspicious_matches.find(i);
5353
// if there is an overlap, remove the suspicious match that overlaps
5454
// hopefully this doesn't cause problems with other submissions thinking

tests/data/hash_all/config.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
{
22
"language": "plaintext",
3-
"sequence_length": 2
3+
"hash_size": 2
44
}

tests/data/test_lichen/repeated_sequences/expected_output/config.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
],
1111
"language": "plaintext",
1212
"threshold": 5,
13-
"sequence_length": 4,
14-
"prior_term_gradeables": [],
13+
"hash_size": 4,
14+
"other_gradeables": [],
1515
"ignore_submissions": []
1616
}

tests/data/test_lichen/repeated_sequences/input/config.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
],
1111
"language": "plaintext",
1212
"threshold": 5,
13-
"sequence_length": 4,
14-
"prior_term_gradeables": [],
13+
"hash_size": 4,
14+
"other_gradeables": [],
1515
"ignore_submissions": []
1616
}

0 commit comments

Comments
 (0)