Skip to content

Commit 5897a67

Browse files
authored
[Refactor:Plagiarism] Rebranding Lichen (#64)
* renaming sequence length to hash size and prior term to other gradeables
* suggested edits
* linting
1 parent 95b77ef commit 5897a67

File tree

10 files changed

+52
-52
lines changed

10 files changed

+52
-52
lines changed

bin/concatenate_all.py

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,8 @@ def validate(config, args):
7777
regex_dirs = config["regex_dirs"]
7878
language = config["language"]
7979
threshold = int(config["threshold"])
80-
sequence_length = int(config["sequence_length"])
81-
prior_term_gradeables = config["prior_term_gradeables"]
80+
hash_size = int(config["hash_size"])
81+
other_gradeables = config["other_gradeables"]
8282

8383
# Check we have a tokenizer to support the configured language
8484
langs_data_json_path = "./data.json" # data.json is in the Lichen/bin directory after install
@@ -87,30 +87,30 @@ def validate(config, args):
8787
if language not in langs_data:
8888
raise SystemExit(f"ERROR! tokenizing not supported for language {language}")
8989

90-
# Check values of common code threshold and sequence length
90+
# Check values of common code threshold and hash size
9191
if (threshold < 2):
9292
raise SystemExit("ERROR! threshold must be >= 2")
9393

94-
if (sequence_length < 1):
95-
raise SystemExit("ERROR! sequence_length must be >= 1")
94+
if (hash_size < 1):
95+
raise SystemExit("ERROR! hash_size must be >= 1")
9696

9797
# Check for backwards crawling
9898
for e in regex_patterns:
9999
if ".." in e:
100100
raise SystemExit('ERROR! Invalid path component ".." in regex')
101101

102-
for ptg in prior_term_gradeables:
103-
for field in ptg:
102+
for gradeable in other_gradeables:
103+
for field in gradeable:
104104
if ".." in field:
105-
raise SystemExit('ERROR! Invalid component ".." in prior_term_gradeable path')
105+
raise SystemExit('ERROR! Invalid component ".." in other_gradeable path')
106106

107-
# check permissions to make sure we have access to the prior term gradeables
107+
# check permissions to make sure we have access to the other gradeables
108108
my_course_group_perms = Path(args.basepath).group()
109-
for ptg in prior_term_gradeables:
110-
if Path(args.datapath, ptg["prior_semester"], ptg["prior_course"]).group()\
109+
for gradeable in other_gradeables:
110+
if Path(args.datapath, gradeable["other_semester"], gradeable["other_course"]).group()\
111111
!= my_course_group_perms:
112-
raise SystemExit(f"ERROR! Invalid permissions to access course {ptg['prior_semester']}"
113-
f"/{ptg['prior_course']}")
112+
raise SystemExit("ERROR! Invalid permissions to access course "
113+
f"{gradeable['other_semester']}/{gradeable['other_course']}")
114114

115115
# make sure the regex directory is one of the acceptable directories
116116
for dir in regex_dirs:
@@ -141,7 +141,7 @@ def main():
141141
version_mode = config["version"]
142142
regex_patterns = config["regex"]
143143
regex_dirs = config["regex_dirs"]
144-
prior_term_gradeables = config["prior_term_gradeables"]
144+
other_gradeables = config["other_gradeables"]
145145
users_to_ignore = config["ignore_submissions"]
146146

147147
# ==========================================================================
@@ -196,14 +196,14 @@ def main():
196196
checkTotalSize(total_concat)
197197

198198
# ==========================================================================
199-
# loop over all of the other prior term gradeables and concatenate their submissions
200-
for other_gradeable in prior_term_gradeables:
199+
# loop over all of the other gradeables and concatenate their submissions
200+
for other_gradeable in other_gradeables:
201201
for dir in regex_dirs:
202202
other_gradeable_path = os.path.join(args.datapath,
203-
other_gradeable["prior_semester"],
204-
other_gradeable["prior_course"],
203+
other_gradeable["other_semester"],
204+
other_gradeable["other_course"],
205205
dir,
206-
other_gradeable["prior_gradeable"])
206+
other_gradeable["other_gradeable"])
207207
# loop over each user
208208
for other_user in sorted(os.listdir(other_gradeable_path)):
209209
other_user_path = os.path.join(other_gradeable_path, other_user)
@@ -233,7 +233,7 @@ def main():
233233
continue
234234

235235
other_output_file_path = os.path.join(args.basepath, "other_gradeables",
236-
f"{other_gradeable['prior_semester']}__{other_gradeable['prior_course']}__{other_gradeable['prior_gradeable']}", # noqa: E501
236+
f"{other_gradeable['other_semester']}__{other_gradeable['other_course']}__{other_gradeable['other_gradeable']}", # noqa: E501
237237
other_user, other_version,
238238
"submission.concatenated")
239239

@@ -264,8 +264,8 @@ def main():
264264
f"for user {user} version {version}")
265265

266266
# do the same for the other gradeables
267-
for other_gradeable in prior_term_gradeables:
268-
other_gradeable_dir_name = f"{other_gradeable['prior_semester']}__{other_gradeable['prior_course']}__{other_gradeable['prior_gradeable']}" # noqa: E501
267+
for other_gradeable in other_gradeables:
268+
other_gradeable_dir_name = f"{other_gradeable['other_semester']}__{other_gradeable['other_course']}__{other_gradeable['other_gradeable']}" # noqa: E501
269269
for other_user in os.listdir(os.path.join(args.basepath, "other_gradeables",
270270
other_gradeable_dir_name)):
271271
other_user_path = os.path.join(args.basepath, "other_gradeables",

bin/hash_all.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def parse_args():
2121

2222
def hasher(lichen_config, lichen_run_config, my_tokenized_file, my_hashes_file):
2323
language = lichen_run_config["language"]
24-
sequence_length = int(lichen_run_config["sequence_length"])
24+
hash_size = int(lichen_run_config["hash_size"])
2525

2626
data_json_path = "./data.json" # data.json is in the Lichen/bin directory after install
2727
with open(data_json_path) as token_data_file:
@@ -37,8 +37,8 @@ def hasher(lichen_config, lichen_run_config, my_tokenized_file, my_hashes_file):
3737
num = len(tokens)
3838
# FIXME: this truncation should be adjusted after testing
3939
token_hashed_values = [(hashlib.md5(''.join(
40-
token_values[x:x+sequence_length]).encode())
41-
.hexdigest())[0:8] for x in range(0, num-sequence_length+1)]
40+
token_values[x:x+hash_size]).encode())
41+
.hexdigest())[0:8] for x in range(0, num-hash_size+1)]
4242

4343
if len(token_hashed_values) > lichen_config["max_sequences_per_file"]:
4444
token_hashed_values = token_hashed_values[slice(0, lichen_config["max_sequences_per_file"])] # noqa E501

bin/process_all.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ KILL_ERROR_MESSAGE="
1818
* An error occured while running Lichen. Your run was probably killed for *
1919
* exceeding the configured resource limits. Before rerunning, perhaps try any *
2020
* of the following edits to the configuration: *
21-
* - Increasing the sequence length *
21+
* - Increasing the hash size *
2222
* - Using only active version *
2323
* - Decreasing the common code threshold *
2424
* - Selecting fewer files to be compared *

bin/tokenize_all.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def main():
7171
tokenize(lichen_config_data, my_concatenated_file, my_tokenized_file)
7272

7373
# ===========================================================================
74-
# tokenize the other prior term gradeables' submissions
74+
# tokenize the other gradeables' submissions
7575
other_gradeables_dir = os.path.join(args.basepath, "other_gradeables")
7676
if not os.path.isdir(other_gradeables_dir):
7777
raise SystemExit("ERROR! Unable to find other gradeables directory")

compare_hashes/compare_hashes.cpp

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ int main(int argc, char* argv[]) {
125125
config.semester = config_file_json.value("semester", "ERROR");
126126
config.course = config_file_json.value("course", "ERROR");
127127
config.gradeable = config_file_json.value("gradeable", "ERROR");
128-
config.sequence_length = config_file_json.value("sequence_length", 1);
128+
config.hash_size = config_file_json.value("hash_size", 1);
129129
config.threshold = config_file_json.value("threshold", 5);
130130

131131
// error checking, confirm there are hashes to work with
@@ -140,8 +140,8 @@ int main(int argc, char* argv[]) {
140140
boost::filesystem::path provided_code_file = lichen_gradeable_path / "provided_code" / "hashes.txt";
141141
// if file exists in that location, the provided code mode is enabled.
142142
config.provided_code_enabled = boost::filesystem::exists(provided_code_file);
143-
// path to prior gradeables' data
144-
boost::filesystem::path prior_terms_dir = lichen_gradeable_path / "other_gradeables";
143+
// path to other gradeables' data
144+
boost::filesystem::path other_gradeables_dir = lichen_gradeable_path / "other_gradeables";
145145

146146

147147
// ===========================================================================
@@ -153,7 +153,7 @@ int main(int argc, char* argv[]) {
153153
std::vector<Submission*> all_submissions;
154154
// Stores all hashes from the instructor provided code
155155
std::unordered_set<hash> provided_code;
156-
// stores all hashes from other prior term gradeables
156+
// stores all hashes from other gradeables
157157
std::unordered_map<hash, std::unordered_map<user_id, std::vector<HashLocation>>> other_gradeables;
158158
// stores the highest match for every student, used later for generating overall_rankings.txt
159159
std::unordered_map<std::string, std::pair<int, float>> highest_matches;
@@ -171,10 +171,10 @@ int main(int argc, char* argv[]) {
171171
}
172172
}
173173

174-
// load prior gradeables' hashes
174+
// load other gradeables' hashes
175175
// iterate over all other gradeables
176176
boost::filesystem::directory_iterator end_iter;
177-
for (boost::filesystem::directory_iterator other_gradeable_itr(prior_terms_dir); other_gradeable_itr != end_iter; ++other_gradeable_itr) {
177+
for (boost::filesystem::directory_iterator other_gradeable_itr(other_gradeables_dir); other_gradeable_itr != end_iter; ++other_gradeable_itr) {
178178
boost::filesystem::path other_gradeable_path = other_gradeable_itr->path();
179179
assert (is_directory(other_gradeable_path));
180180
std::string other_gradeable_str = other_gradeable_itr->path().filename().string();
@@ -193,7 +193,7 @@ int main(int argc, char* argv[]) {
193193
version_number other_version = std::stoi(str_other_version);
194194
assert (other_version > 0);
195195

196-
// load the hashes from this prior submission
196+
// load the hashes from this submission from another gradeable
197197
boost::filesystem::path other_hash_file = other_version_path / "hashes.txt";
198198
std::ifstream istr(other_hash_file.string());
199199
assert(istr.good());
@@ -371,7 +371,7 @@ int main(int argc, char* argv[]) {
371371
std::vector<nlohmann::json> matchingpositions;
372372
nlohmann::json position;
373373
position["start"] = matching_positions_itr->location;
374-
position["end"] = matching_positions_itr->location + config.sequence_length - 1;
374+
position["end"] = matching_positions_itr->location + config.hash_size - 1;
375375
matchingpositions.push_back(position);
376376

377377
// search for all matching positions of the suspicious match in other submissions
@@ -393,7 +393,7 @@ int main(int argc, char* argv[]) {
393393

394394
if (matchingpositions.size() >= lichen_config["max_matching_positions"]) {
395395
std::cout << "Matching positions array truncated for user: [" << other["username"] << "] version: " << other["version"] << std::endl;
396-
std::cout << " - Try increasing the sequence length to fix this problem." << std::endl;
396+
std::cout << " - Try increasing the hash size to fix this problem." << std::endl;
397397
break;
398398
}
399399

@@ -403,7 +403,7 @@ int main(int argc, char* argv[]) {
403403
other["source_gradeable"] = matching_positions_itr->source_gradeable;
404404
}
405405
position["start"] = matching_positions_itr->location;
406-
position["end"] = matching_positions_itr->location + config.sequence_length - 1;
406+
position["end"] = matching_positions_itr->location + config.hash_size - 1;
407407
matchingpositions.push_back(position);
408408
}
409409
}
@@ -414,7 +414,7 @@ int main(int argc, char* argv[]) {
414414

415415
nlohmann::json info;
416416
info["start"] = location_itr->first;
417-
info["end"] = location_itr->first + config.sequence_length - 1;
417+
info["end"] = location_itr->first + config.hash_size - 1;
418418
info["type"] = "match";
419419
info["others"] = others;
420420

@@ -431,7 +431,7 @@ int main(int argc, char* argv[]) {
431431

432432
nlohmann::json info;
433433
info["start"] = *location_itr;
434-
info["end"] = *location_itr + config.sequence_length - 1;
434+
info["end"] = *location_itr + config.hash_size - 1;
435435
info["type"] = "common";
436436

437437
result.push_back(info);
@@ -447,7 +447,7 @@ int main(int argc, char* argv[]) {
447447

448448
nlohmann::json info;
449449
info["start"] = *location_itr;
450-
info["end"] = *location_itr + config.sequence_length - 1;
450+
info["end"] = *location_itr + config.hash_size - 1;
451451
info["type"] = "provided";
452452

453453
result.push_back(info);

compare_hashes/lichen_config.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ struct LichenConfig {
55
std::string semester;
66
std::string course;
77
std::string gradeable;
8-
int sequence_length;
8+
int hash_size;
99
int threshold;
1010
bool provided_code_enabled;
1111
};

compare_hashes/submission.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ float Submission::getPercentage() const {
1616

1717
void Submission::addSuspiciousMatch(location_in_submission location, const HashLocation &matching_location, const hash &matched_hash) {
1818
// figure out if there is an overlap between this hash and a common/provided match
19-
int sequence_length = config_.sequence_length;
20-
for (int i = location - 1; i > location - sequence_length && i >= 0; i--) {
19+
int hash_size = config_.hash_size;
20+
for (int i = location - 1; i > location - hash_size && i >= 0; i--) {
2121
if (common_matches.find(i) != common_matches.end() || provided_matches.find(i) != provided_matches.end()) {
2222
return;
2323
}
@@ -31,8 +31,8 @@ void Submission::addSuspiciousMatch(location_in_submission location, const HashL
3131

3232
void Submission::addCommonMatch(location_in_submission location) {
3333
// figure out if there is an overlap between this hash and a match
34-
int sequence_length = config_.sequence_length;
35-
for (int i = location - 1; i > location - sequence_length && i >= 0; i--) {
34+
int hash_size = config_.hash_size;
35+
for (int i = location - 1; i > location - hash_size && i >= 0; i--) {
3636
std::map<location_in_submission, std::set<HashLocation> >::const_iterator find_i = suspicious_matches.find(i);
3737
// if there is an overlap, remove the suspicious match that overlaps
3838
// hopefully this doesn't cause problems with other submissions thinking
@@ -47,8 +47,8 @@ void Submission::addCommonMatch(location_in_submission location) {
4747

4848
void Submission::addProvidedMatch(location_in_submission location) {
4949
// figure out if there is an overlap between this hash and a match
50-
int sequence_length = config_.sequence_length;
51-
for (int i = location - 1; i > location - sequence_length && i >= 0; i--) {
50+
int hash_size = config_.hash_size;
51+
for (int i = location - 1; i > location - hash_size && i >= 0; i--) {
5252
std::map<location_in_submission, std::set<HashLocation> >::const_iterator find_i = suspicious_matches.find(i);
5353
// if there is an overlap, remove the suspicious match that overlaps
5454
// hopefully this doesn't cause problems with other submissions thinking

tests/data/hash_all/config.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
{
22
"language": "plaintext",
3-
"sequence_length": 2
3+
"hash_size": 2
44
}

tests/data/test_lichen/repeated_sequences/expected_output/config.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
],
1111
"language": "plaintext",
1212
"threshold": 5,
13-
"sequence_length": 4,
14-
"prior_term_gradeables": [],
13+
"hash_size": 4,
14+
"other_gradeables": [],
1515
"ignore_submissions": []
1616
}

tests/data/test_lichen/repeated_sequences/input/config.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
],
1111
"language": "plaintext",
1212
"threshold": 5,
13-
"sequence_length": 4,
14-
"prior_term_gradeables": [],
13+
"hash_size": 4,
14+
"other_gradeables": [],
1515
"ignore_submissions": []
1616
}

0 commit comments

Comments
 (0)