diff --git a/anonymization/anonymization.py b/anonymization/anonymization.py index a2e49f0..f4bfecb 100644 --- a/anonymization/anonymization.py +++ b/anonymization/anonymization.py @@ -14,7 +14,9 @@ # # Copyright 2015-2017 by Claus Hunsen # Copyright 2021 by Thomas Bock +# Copyright 2026 by Thomas Bock # Copyright 2022 by Christian Hechtl +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. """ This file is able to anonymize authors and issue titles after the extraction from the Codeface database was performed. @@ -29,14 +31,15 @@ import sys from os import path, walk, makedirs from os.path import abspath -from shutil import copy - -from codeface.cli import log -from codeface.configuration import Configuration -from codeface.dbmanager import DBManager +from logging import getLogger +from codeface_utils.configuration import Configuration +from codeface_utils.util import setup_logging from csv_writer import csv_writer +# create logger +setup_logging() +log = getLogger(__name__) ## # RUN POSTPROCESSING @@ -104,13 +106,13 @@ def anonymize_authors(author_data, i, author_to_anonymized_author, name_only = F # Don't anonymize the deleted user as this one might be needed for filtering (but add it to the dictionary) if orig_author == "Deleted user" and orig_email == "ghost@github.com": - if not (orig_author, orig_email) in author_to_anonymized_author: + if (orig_author, orig_email) not in author_to_anonymized_author: author_to_anonymized_author[(orig_author, orig_email)] = (orig_author, orig_email) else: # check whether (name, e-mail) pair isn't already present in the dictionary - if not (orig_author, orig_email) in author_to_anonymized_author: + if (orig_author, orig_email) not in author_to_anonymized_author: # check if just the name (without e-mail address) isn't already present in the dictionary - if not orig_author in author_to_anonymized_author: + if orig_author not in author_to_anonymized_author: # if the author has an empty name, only anonymize their e-mail address if not author[1] == "": author[1] = 
("developer" + str(i)) @@ -141,7 +143,7 @@ def anonymize_authors(author_data, i, author_to_anonymized_author, name_only = F # Check for all files in the result directory of the project whether they need to be anonymized - for filepath, dirnames, filenames in walk(data_path): + for filepath, _, filenames in walk(data_path): # (1) Anonymize authors lists if authors_list in filenames: @@ -170,7 +172,7 @@ def anonymize_authors(author_data, i, author_to_anonymized_author, name_only = F # anonymize authors author_data, i, author_to_anonymized_author = \ anonymize_authors(author_data, i, author_to_anonymized_author) - + author_data_gender, i_gender, author_to_anonymized_author_gender = \ anonymize_authors(author_data_gender, i_gender, author_to_anonymized_author_gender, name_only = True) @@ -334,7 +336,7 @@ def anonymize_authors(author_data, i, author_to_anonymized_author, name_only = F makedirs(path.dirname(output_path)) log.info("Write anonymized data to %s ...", output_path) csv_writer.write_to_csv(output_path, bot_data) - + # (8) Anonymize gender list if gender_list in filenames: f = path.join(filepath, gender_list) @@ -343,7 +345,7 @@ def anonymize_authors(author_data, i, author_to_anonymized_author, name_only = F gender_data_new = [] for author in gender_data: - if author[0] in author_to_anonymized_author_gender.keys(): + if author[0] in list(author_to_anonymized_author_gender.keys()): new_person = author_to_anonymized_author_gender[author[0]] author[0] = new_person[0] gender_data_new.append(author) @@ -395,7 +397,7 @@ def run(): # process arguments # - First make all the args absolute __resdir = abspath(args.resdir) - __codeface_conf, __project_conf = map(abspath, (args.config, args.project)) + __codeface_conf, __project_conf = list(map(abspath, (args.config, args.project))) # load configuration __conf = Configuration.load(__codeface_conf, __project_conf) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py 
index 13b1e38..2b54ef7 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -14,6 +14,8 @@ # # Copyright 2015-2017 by Claus Hunsen # Copyright 2020-2022 by Thomas Bock +# Copyright 2026 by Thomas Bock +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. """ This file is able to disambiguate authors after the extraction from the Codeface database was performed. A manually @@ -42,13 +44,15 @@ from os import path, walk, makedirs from os.path import abspath from shutil import copy +from logging import getLogger -from codeface.cli import log -from codeface.configuration import Configuration -from codeface.dbmanager import DBManager - +from codeface_utils.configuration import Configuration +from codeface_utils.util import setup_logging from csv_writer import csv_writer +# create logger +setup_logging() +log = getLogger(__name__) ## # RUN POSTPROCESSING @@ -67,7 +70,7 @@ def perform_data_backup(results_path, results_path_backup): log.info("Backup folder already exists. 
No backup is to be performed.") return - for filepath, dirnames, filenames in walk(results_path): + for filepath, _, filenames in walk(results_path): for filename in filenames: if filename.endswith(".list"): current_file = path.join(filepath, filename) @@ -119,7 +122,7 @@ def is_github_noreply_author(name, email): # Check for all files in the result directory of the project whether they need to be adjusted - for filepath, dirnames, filenames in walk(data_path): + for filepath, _, filenames in walk(data_path): # (1) Remove author 'GitHub ' from authors list if authors_list in filenames: @@ -148,7 +151,7 @@ def is_github_noreply_author(name, email): if not is_github_noreply_author(email[0], email[1]): email_data_new.append(email) else: - log.warn("Remove email %s as it was sent by %s <%s>.", email[2], email[0], email[1]) + log.warning("Remove email %s as it was sent by %s <%s>.", email[2], email[0], email[1]) csv_writer.write_to_csv(f, email_data_new) @@ -198,19 +201,19 @@ def is_github_noreply_author(name, email): # ignore merge commits in the commit data, we consistently ignore them also if they are added # to a pull request. Hence, the corresponding "commit_added" event will be removed now (i.e., # not added to the new issue data any more). - log.warn("Commit %s is added in the GitHub issue data, but not part of the commit data. " + - "Remove the corresponding 'commit_added' event from the issue data...", commit_hash) + log.warning("Commit %s is added in the GitHub issue data, but not part of the commit data. " + + "Remove the corresponding 'commit_added' event from the issue data...", commit_hash) elif is_github_noreply_author(event[9], event[10]): # the event is authored by 'GitHub ', but is not a "commit_added" event, so we # neglect this event and remove it now (i.e., not add it to the new issue data any more). - log.warn("Event %s is authored by %s <%s>. 
Remove this event form the issue data...", - event[8], event[9], event[10]) + log.warning("Event %s is authored by %s <%s>. Remove this event form the issue data...", + event[8], event[9], event[10]) elif (is_github_noreply_author(event[12], event[13][1:-1]) and (event[8] == mentioned_event or event[8] == subscribed_event)): # the event references 'GitHub ', so we neglect this event and remove it now # (i.e., not add it to the new issue data any more). - log.warn("Event %s by %s <%s> references %s <%s>. Remove this event from the issue data...", - event[8], event[9], event[10], event[12], event[13]) + log.warning("Event %s by %s <%s> references %s <%s>. Remove this event from the issue data...", + event[8], event[9], event[10], event[12], event[13]) else: issue_data_new.append(event) @@ -229,7 +232,7 @@ def is_github_noreply_author(name, email): if not is_github_noreply_author(entry[0], entry[1]): bot_data_new.append(entry) else: - log.warn("Remove entry %s <%s> from bots list.", entry[0], entry[1]) + log.warning("Remove entry %s <%s> from bots list.", entry[0], entry[1]) csv_writer.write_to_csv(f, bot_data_new) @@ -285,7 +288,7 @@ def run_postprocessing(conf, resdir, backup_data): return # Check for all files in the result directory of the project whether they need to be adjusted - for filepath, dirnames, filenames in walk(data_path): + for filepath, _, filenames in walk(data_path): # (1) Adjust authors lists if authors_list in filenames: @@ -302,7 +305,7 @@ def run_postprocessing(conf, resdir, backup_data): for author in author_data: # keep author entry only if it should not be removed - if not author in author_data_to_remove: + if author not in author_data_to_remove: author_data_new.append(author) csv_writer.write_to_csv(f, author_data_new) @@ -469,7 +472,7 @@ def run(): # process arguments # - First make all the args absolute __resdir = abspath(args.resdir) - __codeface_conf, __project_conf = map(abspath, (args.config, args.project)) + __codeface_conf, 
__project_conf = list(map(abspath, (args.config, args.project))) __backup_data = args.backup # load configuration diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index 53a397e..9b18dd4 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -13,23 +13,26 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # # Copyright 2021-2022 by Thomas Bock +# Copyright 2026 by Thomas Bock +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. """ This file is able to extract information on bot/human users from csv files. """ import argparse -import httplib import os import sys -import urllib - -import operator -from codeface.cli import log -from codeface.configuration import Configuration +from logging import getLogger +from codeface_utils.configuration import Configuration +from codeface_utils.util import setup_logging from csv_writer import csv_writer +# create logger +setup_logging() +log = getLogger(__name__) + def run(): # get all needed paths and arguments for the method call. 
parser = argparse.ArgumentParser(prog='codeface-extraction-bots-github', description='Codeface extraction') @@ -39,7 +41,7 @@ def run(): # parse arguments args = parser.parse_args(sys.argv[1:]) - __codeface_conf, __project_conf = map(os.path.abspath, (args.config, args.project)) + __codeface_conf, __project_conf = list(map(os.path.abspath, (args.config, args.project))) # create configuration __conf = Configuration.load(__codeface_conf, __project_conf) @@ -75,7 +77,7 @@ def load_bot_data(bot_file, header = True): :return: the read bot data """ - log.devinfo("Read bot data from file '{}'...".format(bot_file)) + log.info("Read bot data from file '{}'...".format(bot_file)) # check if file exists and exit early if not if not os.path.exists(bot_file): @@ -99,7 +101,7 @@ def load_user_data(user_data_file): :return: the read user data """ - log.devinfo("Read user data from file '{}'...".format(user_data_file)) + log.info("Read user data from file '{}'...".format(user_data_file)) # check if file exists and exit early if not if not os.path.exists(user_data_file): @@ -192,12 +194,12 @@ def add_user_data(bot_data, user_data, known_bots_file): continue # get user information if available - if user[0] in user_buffer.keys(): + if user[0] in list(user_buffer.keys()): bot_reduced["user"] = user_buffer[user[0]] bot_reduced["prediction"] = user[-1] bot_data_reduced.append(bot_reduced) else: - log.warn("User '{}' in bot data does not occur in GitHub user data. Remove user...".format(user[0])) + log.warning("User '{}' in bot data does not occur in GitHub user data. 
Remove user...".format(user[0])) # check whether known GitHub bots occur in the GitHub issue data and, if so, update the bot data accordingly bot_data_reduced = check_with_known_bot_list(known_bots_file, bot_data, user_buffer, bot_data_reduced) @@ -224,7 +226,7 @@ def print_to_disk(bot_data, results_folder): user["user"]["email"], user["prediction"] ) - if not entry in lines: + if entry not in lines: lines.append(entry) # write to output file diff --git a/codeface_extraction/codeface_extraction.py b/codeface_extraction/codeface_extraction.py index 7cf24ea..88d7069 100644 --- a/codeface_extraction/codeface_extraction.py +++ b/codeface_extraction/codeface_extraction.py @@ -14,7 +14,9 @@ # # Copyright 2015-2017 by Claus Hunsen # Copyright 2016, 2018-2019 by Thomas Bock +# Copyright 2026 by Thomas Bock # Copyright 2018 by Barbara Eckl +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. """ This file is able to extract developer--artifact relations from the Codeface database. @@ -22,15 +24,18 @@ import argparse import sys +from logging import getLogger from os.path import abspath -from codeface.cli import log -from codeface.configuration import Configuration -from codeface.dbmanager import DBManager - -import extractions +from . 
import extractions from csv_writer import csv_writer +from codeface_utils.dbmanager import DBManager +from codeface_utils.configuration import Configuration +from codeface_utils.util import setup_logging +# create logger +setup_logging() +log = getLogger(__name__) ## # RUN FOR ALL PROJECTS @@ -119,7 +124,7 @@ def run(): # process arguments # - First make all the args absolute __resdir = abspath(args.resdir) - __codeface_conf, __project_conf = map(abspath, (args.config, args.project)) + __codeface_conf, __project_conf = list(map(abspath, (args.config, args.project))) __extract_commit_messages = args.commit_messages __extract_impl = args.implementation __extract_on_range_level = args.range diff --git a/codeface_extraction/extractions.py b/codeface_extraction/extractions.py index 081a353..9c636dd 100644 --- a/codeface_extraction/extractions.py +++ b/codeface_extraction/extractions.py @@ -17,6 +17,7 @@ # Copyright 2019, 2021 by Thomas Bock # Copyright 2018 by Barbara Eckl # Copyright 2018 by Tina Schuh +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. """ This file provides the class 'Extraction' and all of its subclasses. @@ -26,18 +27,19 @@ import os import unicodedata import re +from logging import getLogger from ftfy import fix_encoding from email.header import decode_header, make_header -from codeface.cli import log -from codeface.util import gen_range_path +from codeface_utils.util import gen_range_path +log = getLogger(__name__) + # # GET EXTRACTIONS # - def get_extractions(dbm, conf, resdir, csv_writer, extract_commit_messages, extract_impl, extract_on_range_level): # all extractions are subclasses of Extraction: # instantiate them all! 
@@ -117,7 +119,7 @@ def __init__(self, dbm, conf, res_dir, csv_writer): def is_project_level(self): """Check if this extraction is on project level (i.e., {revision} is not on the SQL statement).""" - return not ("{revision}" in self.sql) + return "{revision}" not in self.sql def is_generic_extraction(self): """Check if this extraction is generic (i.e., it can be used for several artifacts and, hence, @@ -441,7 +443,7 @@ def __init__(self, dbm, conf, resdir, csv_writer): def get_list(self): result = self._run_sql(None, None) lines = self._reduce_result(result) - return [rev for (rev, date) in lines] + return [rev for (rev, _) in lines] # @@ -723,7 +725,7 @@ def _reduce_result(self, result): def fix_characters_in_string(text): """ - Removes control characters such as \r\n \x1b \ufffd from string impl and returns a unicode + Removes control characters such as \r\n \x1b \\ufffd from string impl and returns a unicode string where all control characters have been replaced by a space. :param text: expects a unicode string :return: unicode string @@ -737,12 +739,12 @@ def fix_characters_in_string(text): new_text = fix_encoding(text) # remove unicode characters from "Specials" block - # see: https://www.compart.com/en/unicode/block/U+FFF0 - new_text = re.sub(r"\\ufff.", " ", new_text.encode("unicode-escape")) + # see: https://www.compart.com/en/unicode/block/U+FFF0 + new_text = re.sub(rb"\\ufff.", b" ", new_text.encode("unicode-escape")) # remove all kinds of control characters and emojis # see: https://www.fileformat.info/info/unicode/category/index.htm - new_text = u"".join(ch if unicodedata.category(ch)[0] != "C" else " " for ch in new_text.decode("unicode-escape")) + new_text = "".join(ch if unicodedata.category(ch)[0] != "C" else " " for ch in new_text.decode("unicode-escape")) return new_text @@ -765,12 +767,11 @@ def fix_name_encoding(name): try: # Apply correct encoding and return unicode string - return unicode(make_header(info)) + return str(make_header(info)) 
except UnicodeDecodeError: # Undo utf-8 encoding and return unicode string - return unicode(name.decode('utf-8')) + return str(name.decode('utf-8')) except LookupError: # Encoding not found, return string as is return name - return name diff --git a/codeface_utils/__init__.py b/codeface_utils/__init__.py new file mode 100644 index 0000000..9bad579 --- /dev/null +++ b/codeface_utils/__init__.py @@ -0,0 +1 @@ +# coding=utf-8 diff --git a/codeface_utils/cluster/idManager.py b/codeface_utils/cluster/idManager.py new file mode 100644 index 0000000..43a4be5 --- /dev/null +++ b/codeface_utils/cluster/idManager.py @@ -0,0 +1,305 @@ +# This file is part of codeface-extraction, which is free software: you +# can redistribute it and/or modify it under the terms of the GNU General +# Public License as published by the Free Software Foundation, version 2. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Copyright 2010, 2011 by Wolfgang Mauerer +# Copyright 2012, 2013 by Siemens AG, Wolfgang Mauerer +# Copyright 2025 by Maximilian Löffler +# All Rights Reserved. +# +# The code in this file originates from: +# https://github.com/siemens/codeface/blob/master/codeface/cluster/idManager.py +# We inherit the 'idManager' and 'dbIdManager' classes from codeface. +# The 'csvManager' class is original. 
+ +from __future__ import absolute_import +import re +from email.utils import parseaddr +from logging import getLogger +import http.client as http_client +import urllib.parse as urlparse +import json +import string +import random +import time +from abc import ABC, abstractmethod +import pandas + +from ..util import encode_as_utf8 + + +log = getLogger(__name__) + +class idManager(ABC): + + def __init__(self): + # Cache identical requests to the server + self._cache = {} + + self.fixup_emailPattern = re.compile(r'([^<]+)\s+<([^>]+)>') + self.commaNamePattern = re.compile(r'([^,\s]+),\s+(.+)') + + @abstractmethod + def _query_user_id(self, name, email): + pass + + @abstractmethod + def getPersonFromDB(self, person_id): + pass + + def getPersonID(self, addr): + """Obtain a unique ID from contributor identity credentials.""" + + (name, email) = self._decompose_addr(addr) + if (name, email) not in self._cache: + self._cache[(name, email)] = self._query_user_id(name, email) + ID = self._cache[(name, email)] + + return ID + + def _cleanName(self, name): + # Remove or replace characters in names that are known + # to cause parsing problems in later stages + name = name.replace('\"', "") + name = name.replace("\'", "") + name = name.strip() + + return name + + def _decompose_addr(self, addr): + addr = addr.replace("[", "").replace("]", "") + (name, email) = parseaddr(addr) + + # Handle cases where the name is unknown from commits that potentially + # predate the era of git, where only an e-mail address was given. + # In such a case, we set the name to the e-mail address. Otherwise, + # all authors with unknown name would be matched to one person. + if (name == "unknown" or name == "unknown (none)" or name == "none"): + name = email + + # The eMail parser cannot handle Surname, Name properly. 
+ # Provide a fixup hack for this case + if (name == "" or email.count("@") == 0): + m = re.search(self.fixup_emailPattern, addr) + if m: + name = m.group(1) + email = m.group(2) + m2 = re.search(self.commaNamePattern, name) + if m2: + # Replace "Surname, Name" by "Name Surname" + name = "{0} {1}".format(m2.group(2), m2.group(1)) + + # print "Fixup for addr {0} required -> ({1}/{2})".format(addr, name, email) + else: + # check for the following special format: email@domain.tld <> + strangePattern = re.compile(r'(.*@.*)\s+(<>)') + m3 = re.search(strangePattern, addr) + if m3: + # Replace addr by "email " + name = m3.group(1).split("@")[0] # get name before @ symbol + email = m3.group(1) + # print "Fixup for addr {0} required -> ({1}/{2})".format(addr, name, email) + else: + # In this case, no eMail address was specified. + # print("Fixup for email required, but FAILED for {0}".format(addr)) + name = addr + rand_str = "".join(random.choice(string.ascii_lowercase + string.digits) + for _ in range(10)) + email = "could.not.resolve@" + rand_str + + email = email.lower() + + name = self._cleanName(name) + email = self._cleanName(email) + + return (name, email) + + +class dbIdManager(idManager): + """Provide unique IDs for developers. + + This class provides an interface to the REST id server. Heuristics to + detect developers who operate under multiple identities are included + in the server.""" + + def __init__(self, dbm, conf): + super().__init__() + + self._idMgrServer = conf["idServiceHostname"] + self._idMgrPort = conf["idServicePort"] + self._conn = http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) + + # Create a project ID + self._dbm = dbm + # TODO: Pass the analysis method to idManager via the configuration + # file. However, the method should not influence the id scheme so + # that the results are easily comparable. 
+ self._projectID = self._dbm.getProjectID(conf["project"], + conf["tagging"]) + + # Construct request headers + self.headers = {"Content-type": + "application/x-www-form-urlencoded; charset=utf-8", + "Accept": "text/plain"} + + def _query_user_id(self, name, email): + """Query the ID database for a contributor ID""" + + name = encode_as_utf8(name) + params = urlparse.urlencode({'projectID': self._projectID, + 'name': name, + 'email': email}) + + try: + self._conn.request("POST", "/post_user_id", params, self.headers) + res = self._conn.getresponse() + except: + retryCount = 0 + successful = False + while (retryCount <= 10 and not successful): + log.warning("Could not reach ID service. Try to reconnect " \ + "(attempt {}).".format(retryCount)) + self._conn.close() + self._conn = http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) + time.sleep(60) + #self._conn.ping(True) + try: + self._conn.request("POST", "/post_user_id", params, self.headers) + res = self._conn.getresponse() + successful = True + except: + if retryCount < 10: + retryCount += 1 + else: + retryCount += 1 + log.exception("Could not reach ID service. Is the server running?\n") + raise + + # TODO: We should handle errors by throwing an exception instead + # of silently ignoring them + result = res.read() + jsond = json.loads(result) + try: + id = jsond["id"] + except KeyError: + raise Exception("Bad response from server: '{}'".format(jsond)) + + return (id) + + def getPersonFromDB(self, person_id): + """Query the ID database for a contributor and all corresponding data""" + + try: + self._conn.request("GET", "/getUser/{}".format(person_id), headers=self.headers) + res = self._conn.getresponse() + except: + self._conn.close() + self._conn = http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) + retryCount = 0 + successful = False + while (retryCount <= 10 and not successful): + log.warning("Could not reach ID service. 
Try to reconnect " \ + "(attempt {}).".format(retryCount)) + self._conn.close() + self._conn = http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) + time.sleep(60) + #self._conn.ping(True) + try: + self._conn.request("GET", "/getUser/{}".format(person_id), headers=self.headers) + res = self._conn.getresponse() + successful = True + except: + if retryCount < 10: + retryCount += 1 + else: + retryCount += 1 + log.exception("Could not reach ID service. Is the server running?\n") + raise + + result = res.read() + jsond = json.loads(result)[0] + + return (jsond) + + +class csvIdManager(idManager): + """Provide unique IDs for developers. + + This class provides an interface to CSV id files. + """ + def __init__(self, conf): + super().__init__() + + # CSV file containing the IDs + self.csv_file = conf["csvFile"] + self.csv_sep = conf["csvSeparator"] + self.df = self._verifyCsvFile() + + def _verifyCsvFile(self): + with open(self.csv_file, "r") as file: + df = pandas.read_csv(file, sep=self.csv_sep, names=['ID', 'name', 'email']) + return df + + def _addRow(self, name, email): + + # determine next ID + max_id = self.df['ID'].max() + next_id = 0 if bool(pandas.isna(max_id)) else int(max_id) + 1 + + # append new row + self.df = self.df._append({ + 'ID': next_id, + 'name': name, + 'email': email + }, ignore_index=True) + + # dump df to file + file = open(self.csv_file, "w") + self.df.to_csv(file, sep=self.csv_sep, index=False, header=False) + + return next_id + + def _query_user_id(self, name, email): + """Query the ID csv file for a contributor ID""" + + # no name is okay, but no email is not + if not email: + return -1 + + # Match by name and email. + # Disregard random string after "could.not.resolve@" in email + # to avoid creating multiple entries for the same person. 
+ if email.startswith("could.not.resolve@"): + rows = self.df[(self.df['name'] == name) & + (self.df['email'].str.startswith("could.not.resolve@"))] + else: + rows = self.df[(self.df['name'] == name) & + (self.df['email'] == email)] + + if len(rows) == 0: + name = '' if not name else name + return self._addRow(name, email) + + elif len(rows) == 1: + return int(rows['ID'].values[0]) + + else: + raise Exception("Constructed author list is in invalid format. Duplicate entries found") + + def getPersonFromDB(self, person_id): + rows = self.df[self.df['ID'] == person_id] + if len(rows) == 1: + return { + 'name': rows['name'].values[0], + 'email1': rows['email'].values[0], + 'id': person_id + } diff --git a/codeface_utils/configuration.py b/codeface_utils/configuration.py new file mode 100644 index 0000000..e4a654a --- /dev/null +++ b/codeface_utils/configuration.py @@ -0,0 +1,217 @@ +# This file is part of codeface-extraction, which is free software: you +# can redistribute it and/or modify it under the terms of the GNU General +# Public License as published by the Free Software Foundation, version 2. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Copyright 2013 by Siemens AG, Johannes Ebke +# Copyright 2025 by Maximilian Löffler +# Copyright 2026 by Thomas Bock +# All Rights Reserved. 
+# +# The code in this file originates from: +# https://github.com/siemens/codeface/blob/master/codeface/configuration.py + +''' +Configuration module for codeface + +Encapsulates a configuration as an immutable dict +''' + +from __future__ import absolute_import +from tempfile import NamedTemporaryFile +from collections.abc import Mapping +from logging import getLogger +import yaml + +from codeface_utils.linktype import LinkType + + +log = getLogger(__name__) + +class ConfigurationError(Exception): + '''Raised if any part of the configuration is malformed''' + pass + +class Configuration(Mapping): + ''' + Encapsulates the codeface configuration + ''' + + GLOBAL_KEYS = ('dbname', 'dbhost', 'dbuser', 'dbpwd', + 'idServiceHostname', 'idServicePort') + GLOBAL_OPTIONAL_KEYS = ('dbport', 'useCsv') + PROJECT_KEYS = ('project', 'repo', 'tagging', 'revisions', 'rcs') + OPTIONAL_KEYS = ('description', 'ml', 'mailinglists', 'sleepTime', + 'proxyHost', 'proxyPort', 'bugsProjectName', + 'productAsProject', 'issueTrackerType', + 'issueTrackerURL', 'issueTrackerProject', + 'issueTrackerUser', 'issueTrackerPassword', + 'understand', 'sloccount', 'windowSize', 'numWindows', + 'qualityType', 'communicationType', 'artifactType', 'dependencyType', + 'csvFile', 'csvSeparator') + ALL_KEYS = set(GLOBAL_KEYS + GLOBAL_OPTIONAL_KEYS + PROJECT_KEYS + + OPTIONAL_KEYS) + + def __init__(self): + ''' + Initialize an empty configuration object with the default values + ''' + self._conf = { + 'idServiceHostname' : '127.0.0.1', + 'idServicePort' : 8080 + } + + self._conf_file_loc = None + + @classmethod + def load(cls, global_conffile, local_conffile=None): + ''' + Load configuration from global/local files + ''' + c = Configuration() + log.info("Loading global configuration file '{}'". + format(global_conffile)) + cls._global_conf = c._load(global_conffile) + c._conf.update(c._global_conf) + if local_conffile: + log.info("Loading project configuration file '{}'". 
+ format(local_conffile)) + cls._project_conf = c._load(local_conffile) + c._conf.update(c._project_conf) + else: + log.info("Not loading project configuration file!") + c._initialize() + c._check_sanity() + return c + + def _load(self, filename): + '''Helper function that checks loading errors and logs them''' + try: + return yaml.load(open(filename, 'r'), Loader=yaml.SafeLoader) + except IOError: + log.exception("Could not open configuration file '{}'". + format(filename)) + raise + except yaml.YAMLError: + log.exception("Could not parse configuration file '{}'". + format(filename)) + raise + + def _initialize(self): + '''Infer missing values in the configuration''' + if "rcs" not in self: + self._conf["rcs"] = [None for _ in range(len(self["revisions"]))] + + if "mailinglists" not in self: + self._conf["mailinglists"] = [] + if "ml" in self: + self._conf["mailinglists"].append({"name": self["ml"]}) + for ml in self._conf["mailinglists"]: + ml.setdefault("type", "dev") + ml.setdefault("source", "gmane") + + if "dbport" not in self: + self._conf["dbport"] = 3306 + else: + self._conf["dbport"] = int(self._conf["dbport"]) + + if "useCsv" not in self: + self._conf["useCsv"] = False + + def _check_sanity(self): + ''' + Check that the configuration makes sense. + :raise ConfigurationError + ''' + + # Some elementary sanity checks + for key in self.GLOBAL_KEYS: + if self._project_conf and key in self._project_conf: + log.critical("The key '{}' may not be overridden in the " + "project configuration file".format(key)) + raise ConfigurationError('Invalid configuration key.') + + for key in self.GLOBAL_KEYS + self.PROJECT_KEYS: + if key not in self: + log.critical("Required key '{}' missing in configuration!" 
+ ''.format(key)) + raise ConfigurationError('Missing configuration key.') + + if self['tagging'] not in LinkType.get_all_link_types(): + log.critical('Unsupported tagging mechanism specified!') + raise ConfigurationError('Unsupported tagging mechanism.') + + if len(self["revisions"]) < 2: + log.info("No revision range specified in configuration, using auto-generated windows") + + if len(self["revisions"]) != len(self["rcs"]): + log.critical("Malformed configuration: revision and rcs list " + "lengths differ! Found {0} revisions and {1} release " + "candidates.".format(len(self["revisions"]), len(self["rcs"]))) + raise ConfigurationError('Malformed configuration.') + + if self["useCsv"]: + if "csvFile" not in self: + log.critical("Malformed configuration: useCsv is true, but " + "csvFile is not specified.") + raise ConfigurationError('Malformed configuration.') + if "csvSeparator" not in self: + self["csvSeparator"] = "," + + unknown_keys = [k for k in self if k not in self.ALL_KEYS] + for key in unknown_keys: + log.warning("Unknown key '{}' in configuration.".format(key)) + + def write(self): + conf_file = NamedTemporaryFile(mode='w', prefix=self._conf['project'], + delete=False) + yaml.dump(self._conf, conf_file) + self._conf_file_loc = conf_file.name + conf_file.close() + + def get_conf_file_loc(self): + return self._conf_file_loc + + # Function for the Configuration object to function as a dict + def __getitem__(self, key): + return self._conf[key] + + def __setitem__(self, key, value): + self._conf[key] = value + + def __len__(self): + return len(self._conf) + + def __iter__(self): + return iter(self._conf) + + def __keys__(self): + return list(self._conf.keys()) + + def __str__(self): + ''' + Return a pretty string for display and logging + ''' + r = [] + r.append("--- # global codeface configuration") + for key in self.GLOBAL_KEYS: + if key in self: + r.append("{}: {}".format(key, repr(self[key]))) + r.append("# codeface project configuration") + for key in 
self.PROJECT_KEYS + self.OPTIONAL_KEYS: + if key in self: + r.append("{}: {}".format(key, repr(self[key]))) + unknown = [k for k in self if k not in self.ALL_KEYS] + if unknown: + r.append("# Unknown keys") + for key in unknown: + r.append("{}: {}".format(key, repr(self[key]))) + return "\n".join(r) diff --git a/codeface_utils/dbmanager.py b/codeface_utils/dbmanager.py new file mode 100644 index 0000000..aecc172 --- /dev/null +++ b/codeface_utils/dbmanager.py @@ -0,0 +1,481 @@ +#! /usr/bin/env python +# This file is part of Codeface. Codeface is free software: you can +# redistribute it and/or modify it under the terms of the GNU General Public +# License as published by the Free Software Foundation, version 2. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Copyright 2013 by Siemens AG, Wolfgang Mauerer +# Copyright 2025 by Maximilian Löffler +# All Rights Reserved. 
+ +# Thin sql database wrapper + +from __future__ import absolute_import +from __future__ import print_function +import MySQLdb as mdb +import time +from datetime import datetime, timezone +from logging import getLogger +from contextlib import contextmanager + + +# create logger +log = getLogger(__name__) + +@contextmanager +def _log_db_error(action, args=None): + try: + yield + except mdb.Error as e: + if args: + try: + action = action % args + except: + pass + log.critical('MySQL error {e[0]} during "{action}": {e[1]}' + ''.format(e=e.args, action=action)) + raise + + +class DBManager: + """This class provides an interface to the codeface sql database.""" + + def __init__(self, conf): + + self.conf = conf + self.__openConnection(conf) + + # max_packet_size = 1024 * 1024 * 512 + # self.doExec("SET GLOBAL max_allowed_packet=%s", (max_packet_size,)) + + def __del__(self): + if self.con is not None: + self.con.close() + + def __openConnection(self, conf): + try: + self.con = None + self.con = mdb.Connection(host=conf["dbhost"], + port=conf["dbport"], + user=conf["dbuser"], + passwd=conf["dbpwd"], + db=conf["dbname"], + charset="utf8", + use_unicode=True) + self.cur = self.con.cursor() + log.debug( + "Establishing MySQL connection to " + "{c[dbuser]}@{c[dbhost]}:{c[dbport]}, DB '{c[dbname]}'" + .format(c=conf)) + except mdb.Error as e: + log.critical( + "Failed to establish MySQL connection to " + "{c[dbuser]}@{c[dbhost]}:{c[dbport]}, DB '{c[dbname]}'" + ": {e[1]} ({e[0]})" + "".format(c=conf, e=e.args)) + raise + + + def doExec(self, stmt, args=None): + with _log_db_error(stmt, args): + retryCount = 0 + while retryCount < 10: + try: + if isinstance(args, list): + res = self.cur.executemany(stmt, args) + else: + res = self.cur.execute(stmt, args) + return res + except mdb.OperationalError as dbe: + retryCount += 1 + log.info("DBE args: " + str(dbe.args)) + if dbe.args[0] == 1213: # Deadlock! retry... 
+ log.warning("Recoverable deadlock in MySQL - retrying " \ + "(attempt {}).".format(retryCount)) + elif dbe.args[0] == 2003: # Can't connect to MySQL server + log.warning("Can't connect to MySQL server - retrying " \ + "(attempt {}).".format(retryCount)) + time.sleep(60) + log.warning("Try opening new connection") + self.con.close() + log.warning("Connection successfully closed") + self.__openConnection(self.conf) + log.warning("Opening new connection successful") + elif dbe.args[0] == 2006: # Server gone away... + log.warning("MySQL Server gone away, trying to reconnect " \ + "(attempt {}).".format(retryCount)) + time.sleep(60) + log.warning("Try opening new connection") + self.con.close() + log.warning("Connection successfully closed") + self.__openConnection(self.conf) + log.warning("Opening new connection successful") + elif dbe.args[0] == 2013 or dbe.args[0] == 1053: # Lost connection to MySQL server during query | Server shutdown in progress + log.warning("Lost connection to MySQL server during query, " \ + "trying to reconnect (attempt {}).".format(retryCount)) + time.sleep(60) + log.warning("Try opening new connection") + self.con.close() + log.warning("Connection successfully closed") + self.__openConnection(self.conf) + log.warning("Opening new connection successful") + elif dbe.args[0] == 1153: # Got a packet bigger than 'max_allowed_packet' bytes + log.warning("Sent a too big packet ({lnos} lines), retrying with smaller packets.".format( + lnos=len(args))) + ## split package into smaller packets of size 'chunk_size' + chunk_size = 100 + args_list = [args[i:i + chunk_size] for i in range(0, len(args), chunk_size)] + ## retrying + time.sleep(60) + self.con.ping(True) + for chunk in args_list: + self.doExec(stmt, chunk) + else: + self.con.close() + raise + + # Give up after too many retry attempts and propagate the + # problem to the caller. 
Either it's fixed with a different + # query, or the analysis fails + log.error("DB access failed after ten attempts, giving up") + self.con.close() + raise + + def doFetchAll(self): + with _log_db_error("fetchall"): + return self.cur.fetchall() + + def doCommit(self): + with _log_db_error("commit"): + return self.con.commit() + + def doExecCommit(self, stmt, args=None): + self.doExec(stmt, args) + self.doCommit() + + # NOTE: We don't provide any synchronisation since by assumption, + # a single project is never analysed from two threads. + def getProjectID(self, name, analysisMethod): + """ + Return the project ID of the given name/analysisMethod combination. + If the project does not exist yet in the database, it is created. + """ + self.doExec("SELECT id FROM project WHERE name=%s " + "AND analysisMethod=%s", (name, analysisMethod)) + if self.cur.rowcount == 0: + # Project is not contained in the database + log.info("Creating new project {}/{}". + format(name, analysisMethod)) + self.doExecCommit("INSERT INTO project (name, analysisMethod) " + + "VALUES (%s, %s);", (name, analysisMethod)) + self.doExec("SELECT id FROM project WHERE name=%s;", (name,)) + elif self.cur.rowcount > 1: + raise Exception("Duplicate projects {}/{} in database!". + format(name, analysisMethod)) + pid = self.doFetchAll()[0][0] + log.info("Using project {}/{} with ID {}". 
+ format(name, analysisMethod, pid)) + return pid + + def get_project(self, pid): + self.doExec("SELECT name, analysisMethod FROM project" + " WHERE id=%s", pid) + if self.cur.rowcount == 0: + raise Exception("Project id {} not found!".format(pid)) + return self.doFetchAll()[0] + + def get_edgelist(self, cid): + self.doExec("SELECT fromId, toId, weight FROM edgelist \ + WHERE clusterId={}".format(cid)) + if self.cur.rowcount == 0: + raise Exception("Cluster id {} not found!".format(cid)) + return self.doFetchAll() + + def get_file_dev(self, project_id, range_id): + self.doExec("SELECT * FROM (SELECT id, commitHash, commitDate, author, description " \ + "FROM commit WHERE projectId={} AND releaseRangeId={}) AS Commits " \ + "INNER JOIN (SELECT file, commitId, SUM(size) AS fileSize " \ + "FROM commit_dependency GROUP BY commitId, file) AS commitFileLOC " \ + "ON Commits.id=commitFileLOC.commitId ORDER BY " \ + "commitFileLOC.file, commitFileLOC.commitId".format(project_id, range_id)) + + if self.cur.rowcount == 0: + raise Exception("Could not obtain file-dev information for project {} "\ + "(release range {}!".format(project_id, range_id)) + return self.doFetchAll() + + def get_release_ranges(self, project_id): + self.doExec("SELECT id FROM release_range \ + WHERE projectId={}".format(project_id)) + if self.cur.rowcount == 0: + raise Exception("No release ranges found for project {}!" 
+ .format(project_id)) + return [range_entry[0] for range_entry in self.doFetchAll()] + + def get_cluster_id(self, pid, release_range_id=None): + if release_range_id: + self.doExec("SELECT id FROM cluster WHERE clusterNumber=-1 \ + AND projectId={} AND releaseRangeId={}" + .format(pid, release_range_id)) + else: + self.doExec("SELECT id FROM cluster WHERE clusterNumber=-1 \ + AND projectId={}".format(pid)) + if self.cur.rowcount == 0: + raise Exception("Cluster from project {} not found!".format(pid)) + return self.doFetchAll()[0][0] + + def get_project_persons(self, pid): + self.doExec("SELECT id, name FROM person \ + WHERE projectId={}".format(pid)) + if self.cur.rowcount == 0: + raise Exception("Persons from project {} not found!".format(pid)) + return (self.doFetchAll()) + + def getTagID(self, projectID, tag, type): + """Determine the ID of a tag, given its textual form and the type""" + self.doExec("SELECT id FROM release_timeline WHERE projectId=%s " + + "AND tag=%s AND type=%s", (projectID, tag, type)) + if self.cur.rowcount != 1: + raise Exception("Tag '{}' of type {} is {} times in the DB!". + format(tag, type, self.cur.rowcount)) + return self.doFetchAll()[0][0] + + def getCommitId(self, projectId, commitHash, releaseRangeID=None): + stmt = "SELECT id FROM commit WHERE commitHash=%s AND projectId=%s" + args = (commitHash, projectId) + + if (releaseRangeID): + stmt += " AND releaseRangeId=%s" + args += (releaseRangeID, ) + + self.doExec(stmt, args) + if self.cur.rowcount == 0: + raise Exception("Commit {0} from project {1} not found!". 
+ format(commitHash, projectId)) + return self.doFetchAll()[0][0] + + def getRevisionID(self, projectID, tag): + return self.getTagID(projectID, tag, "release") + + def getRCID(self, projectID, tag): + return self.getTagID(projectID, tag, "rc") + + def getReleaseRangeID(self, projectID, revisionIDs): + """Given a pair of release IDs, determine the release range ID""" + self.doExec("SELECT id FROM release_range WHERE projectId=%s " + + "AND releaseStartId=%s AND releaseEndId=%s", + (projectID, revisionIDs[0], revisionIDs[1])) + if self.cur.rowcount != 1: + raise Exception("Release range from '{r[0]}' to '{r[1]}' is {c} " + "times in the DB!". + format(r=revisionIDs, c=self.cur.rowcount)) + return self.doFetchAll()[0][0] + + def getProjectTimeRange(self, pid): + """Given a project ID, determine the start and end date of available VCS data. + Returns a tuple with start end end date in the form YYYY-MM-DD""" + self.doExec("SELECT MIN(date_start) FROM revisions_view " + "WHERE projectId={}".format(pid)) + if self.cur.rowcount == 0: + raise Exception("No start date for pid {} found!".format(pid)) + date_start = self.doFetchAll()[0][0].strftime("%Y-%m-%d") + + self.doExec("SELECT MAX(date_end) FROM revisions_view " + "WHERE projectId={}".format(pid)) + if self.cur.rowcount == 0: + raise Exception("No end date for pid {} found!".format(pid)) + date_end = self.doFetchAll()[0][0].strftime("%Y-%m-%d") + + return (date_start, date_end) + + def get_commit_cdate(self, pid, hash): + """Given a project ID and a commit hash, obtain the commit date + in format YYYY-MM-DD""" + self.doExec("SELECT commitDate FROM commit " + "WHERE projectId={} and commitHash='{}'".format(pid, hash)) + if self.cur.rowcount == 0: + raise Exception("No date found for commit {} (pid {}) found!".format(hash, pid)) + date = self.doFetchAll()[0][0].strftime("%Y-%m-%d") + + return (date) + + def get_release_range(self, project_id, range_id): + self.doExec( + "SELECT st.tag, nd.tag, rc.tag FROM release_range " 
+ "LEFT JOIN release_timeline AS st ON st.id=releaseStartId " + "LEFT JOIN release_timeline AS nd ON nd.id=releaseEndId " + "LEFT JOIN release_timeline AS rc ON rc.id=releaseRCStartId " + "WHERE release_range.projectId=%s AND release_range.id=%s", + (project_id, range_id)) + ranges = self.doFetchAll() + if self.cur.rowcount == 0: + raise Exception("Range id {} not found!".format(project_id)) + return ranges[0] + + def get_num_commits_in_range(self, range_id): + self.doExec("SELECT COUNT(*) FROM commit WHERE releaseRangeId={}".format(range_id)) + if self.cur.rowcount == 0: + raise Exception("Range id {} not found in get_num_commits_in_range!".format(range_id)) + return self.doFetchAll()[0][0] + + def update_release_timeline(self, project, tagging, revs, rcs, + recreate_project=False): + ''' + For a project, update the release timeline table with the given + revisions. If existing releases/rcs from the timeline are not in + order, the conservative approach is taken and the whole project is + recreated to avoid inconsistencies. + + Returns true if the project had to be recreated. + ''' + assert len(revs) >= 2 + assert len(revs) == len(rcs) + rcs = [rc if rc else rev for rc, rev in zip(rcs, revs)] + pid = self.getProjectID(project, tagging) + + if not recreate_project: + # First check if the release timeline is sane and in order + self.doExec("SELECT tag FROM release_timeline WHERE projectId=%s " + "AND type='release' ORDER BY id", (pid,)) + tags = [tag for (tag,) in self.doFetchAll()] + if len(set(tags)) != len(tags): + log.error("Database corrupted: Duplicate release entries in " + "release_timeline! 
Recreating project.") + recreate_project = True + if len(tags) == 0: + recreate_project = True + + # Check that the tags are in the same order + if not recreate_project: + for i, tag in enumerate(tags): + if i >= len(revs): + log.warning("List of revisions to analyse was shortened.") + break + if revs[i] != tag: + log.error("Release number {} changed tag from {} to " + "{}. Recreating project.". + format(i, tag, revs[i])) + recreate_project = True + break + + # Check that the RC tags are in order + if not recreate_project: + self.doExec("SELECT tag FROM release_timeline WHERE " + "projectId=%s AND type='rc' ORDER BY id", (pid,)) + rctags = [tag for (tag,) in self.doFetchAll()] + if len(set(rctags)) != len(rctags): + log.error("Database corrupted: Duplicate RC entries in " + "release_timeline! Recreating project.") + recreate_project = True + + # Check for changes in release candidates + # Note that the first RC is unused, since it refers to the end + # of a previous period + if not recreate_project: + for i, tag in enumerate(rctags): + if i + 1 >= len(rcs): + log.warning("List of release candidates to analyse " + "was shortened.") + break + if rcs[i + 1] != tag: + log.error("Release candidate number {} changed tag " + "from {} to {}. Recreating project.". 
+ format(i, tag, rcs[i + 1])) + recreate_project = True + break + + # Go through the release ranges and check if they have changed + if not recreate_project: + self.doExec( + "SELECT st.tag, nd.tag, rc.tag FROM release_range " + "LEFT JOIN release_timeline AS st ON st.id=releaseStartId " + "LEFT JOIN release_timeline AS nd ON nd.id=releaseEndId " + "LEFT JOIN release_timeline AS rc ON rc.id=releaseRCStartId " + "WHERE release_range.projectId=%s ORDER BY release_range.id", + (pid,)) + ranges = self.doFetchAll() + if len(set(ranges)) != len(tags) - 1: + log.error("Database corrupted: Number of release ranges" + " does not match number of release tags!") + recreate_project = True + + for i, (start, end, rc) in enumerate(self.doFetchAll()): + if i + 1 >= len(revs) or recreate_project: + # List of revisions to analyse was shortened + break + if (start, end) != (revs[i], revs[i + 1]): + log.error("Release range {} changed from {} to {}." + " Recreating project.". + format(i, (start, end), (revs[i], revs[i + 1]))) + recreate_project = True + break + if rc != rcs[i + 1]: + log.error("Release candidate {} changed from {} to {}." + " Recreating project.". + format(i, rc, rcs[i + 1])) + recreate_project = True + break + + # Recreate project if necessary + if recreate_project: + # This should ripple through the database and delete + # all referencing entries for project + log.warning("Deleting and re-creating project {}/{}.". 
+ format(project, tagging)) + self.doExecCommit("DELETE FROM `project` WHERE id=%s", (pid,)) + pid = self.getProjectID(project, tagging) + tags = [] + rctags = [] + + # at this point we have verified that the first len(tags) + # entries are identical + new_ranges_to_process = [] + if len(revs) > len(tags): + n_new = len(revs) - len(tags) + log.info("Adding {} new releases...".format(n_new)) + previous_rev = None + if len(tags) > 0: + previous_rev = tags[-1] + for rev, rc in zip(revs, rcs)[len(tags):]: + self.doExecCommit("INSERT INTO release_timeline " + "(type, tag, projectId) " + "VALUES (%s, %s, %s)", + ("release", rev, pid)) + + if previous_rev is not None and rc: + self.doExecCommit("INSERT INTO release_timeline " + "(type, tag, projectId) " + "VALUES (%s, %s, %s)", + ("rc", rc, pid)) + + if previous_rev is not None: + startID = self.getRevisionID(pid, previous_rev) + endID = self.getRevisionID(pid, rev) + if rc: + rcID = self.getRCID(pid, rc) + else: + rcID = "NULL" + self.doExecCommit("INSERT INTO release_range " + "(releaseStartId, releaseEndId, " + "projectId, releaseRCStartId) " + "VALUES (%s, %s, %s, %s)", + (startID, endID, pid, rcID)) + new_ranges_to_process.append(self.getReleaseRangeID(pid, + (startID, endID))) + previous_rev = rev + # now we are in a well-defined state. + # Return the ids of the release ranges we have to process + return new_ranges_to_process + + +def tstamp_to_sql(tstamp): + """Convert a Unix timestamp into an SQL compatible DateTime string""" + return datetime.fromtimestamp(tstamp, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S") diff --git a/codeface_utils/linktype.py b/codeface_utils/linktype.py new file mode 100644 index 0000000..617d11f --- /dev/null +++ b/codeface_utils/linktype.py @@ -0,0 +1,42 @@ +# This file is part of codeface-extraction, which is free software: you +# can redistribute it and/or modify it under the terms of the GNU General +# Public License as published by the Free Software Foundation, version 2. 
+# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Copyright 2013 by Siemens AG, Wolfgang Mauerer +# Copyright 2014 by Matthias Dittrich +# All Rights Reserved. +# +# The code in this file originates from: +# https://github.com/siemens/codeface/blob/master/codeface/linktype.py + + +#enum-like class to distinguish between the various +#methods used to link individuals +class LinkType: + tag = "tag" + proximity = "proximity" + committer2author = "committer2author" + file = "file" + feature = "feature" + feature_file = "feature_file" + + _all_link_types = \ + (tag, proximity, committer2author, file, feature, feature_file) + + @staticmethod + def get_all_link_types(): + return LinkType._all_link_types + + @staticmethod + def get_tag_types(): + return ["Signed-off-by", "Acked-by", "CC", "Reviewed-by", + "Reported-by", "Tested-by", "Patch"] diff --git a/codeface_utils/util.py b/codeface_utils/util.py new file mode 100644 index 0000000..59402d8 --- /dev/null +++ b/codeface_utils/util.py @@ -0,0 +1,111 @@ +# This file is part of codeface-extraction, which is free software: you +# can redistribute it and/or modify it under the terms of the GNU General +# Public License as published by the Free Software Foundation, version 2. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Copyright 2013 by Siemens AG, Wolfgang Mauerer +# Copyright 2025 by Maximilian Löffler +# Copyright 2026 by Thomas Bock +# All Rights Reserved. +# +# The code in this file originates from: +# https://github.com/siemens/codeface/blob/master/codeface/util.py + +from __future__ import absolute_import +import logging +import os +import os.path +import re +import sys +import traceback +import unicodedata +from threading import enumerate as threading_enumerate +from ftfy import fix_encoding + +def setup_logging(level=logging.INFO): + logging.basicConfig( + level=level, + format='%(asctime)s [%(name)s] %(levelname)s: %(message)s' + ) + +log = logging.getLogger(__name__) + +# Function to dump the stacks of all threads +def get_stack_dump(): + id2name = dict([(th.ident, th.name) for th in threading_enumerate()]) + code = ["Stack dump:"] + for threadId, stack in sys._current_frames().items(): + code.append("") + code.append("# Thread: %s(%d)" % (id2name.get(threadId,""), threadId)) + for filename, lineno, name, line in traceback.extract_stack(stack): + code.append('File: "%s", line %d, in %s' % (filename, lineno, name)) + if line: + code.append(" %s" % (line.strip())) + return code + +def gen_range_path(base_path, i, start_rev, end_rev): + if (len(start_rev) == 40): + # Same logic as above, but construct a file system path + start_rev = start_rev[0:6] + end_rev = end_rev[0:6] + return(os.path.join(base_path, "{0}--{1}-{2}". + format(str(i).zfill(3), start_rev, end_rev))) + +def encode_as_utf8(string): + """ + Encode the given string properly in UTF-8, + independent from its internal representation (str or unicode). + + This function removes any control characters and four-byte-encoded unicode characters and replaces them + with " ". 
(Four-byte-encoded unicode characters do not work with 'utf8' encoding of MySQL.) + + :param string: any string + :return: the UTF-8 encoded string of type str + """ + + # Normalize to str first + if isinstance(string, bytes): + try: + text = string.decode("utf-8") + except UnicodeDecodeError: + text = string.decode("utf-8", errors="replace") + elif isinstance(string, str): + text = string + else: + # not string-like, return as-is + return string + + # convert to real unicode-utf8 encoded string, fix_text ensures proper encoding + new_string = fix_encoding(text) + + # remove unicode characters from "Specials" block + # see: https://www.compart.com/en/unicode/block/U+FFF0 + new_string = re.sub(r"[\ufff0-\uffff]", " ", new_string) + + # remove all kinds of control characters and emojis + # see: https://www.fileformat.info/info/unicode/category/index.htm + new_string = u"".join(ch if unicodedata.category(ch)[0] != "C" else " " for ch in new_string) + + new_string = new_string.encode("utf-8") + + # replace any 4-byte characters with a single space (previously: four_byte_replacement) + try: + # UCS-4 build + four_byte_regex = re.compile(u"[\U00010000-\U0010ffff]") + except re.error: + # UCS-2 build + four_byte_regex = re.compile(u"[\uD800-\uDBFF][\uDC00-\uDFFF]") + + four_byte_replacement = r" " # r":4bytereplacement:" + new_string = four_byte_regex.sub(four_byte_replacement, new_string.decode("utf-8")).encode("utf-8") + + return new_string.decode("utf-8") + diff --git a/csv_writer/csv_writer.py b/csv_writer/csv_writer.py index 2804081..ca453be 100644 --- a/csv_writer/csv_writer.py +++ b/csv_writer/csv_writer.py @@ -15,6 +15,7 @@ # Copyright 2017 by Claus Hunsen # Copyright 2018 by Anselm Fehnker # Copyright 2020-2021 by Thomas Bock +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. 
""" This file provides the needed functions for standardized CSV writing @@ -23,19 +24,6 @@ import csv -def __encode(line): - """Encode the given line (a tuple of columns) properly in UTF-8.""" - - lineres = () # re-encode column if it is unicode - for column in line: - if type(column) is unicode: - lineres += (column.encode("utf-8"),) - else: - lineres += (column,) - - return lineres - - def write_to_csv(file_path, lines, append=False): """ Write the given lines to the file with the given file path. @@ -45,14 +33,13 @@ def write_to_csv(file_path, lines, append=False): :param append: Flag if lines shall be appended to file or overwrite file """ - open_mode = "a+b" if append else "wb" + open_mode = "a" if append else "w" - with open(file_path, open_mode) as csv_file: + with open(file_path, mode=open_mode, encoding="utf-8") as csv_file: wr = csv.writer(csv_file, delimiter=';', lineterminator='\n', quoting=csv.QUOTE_NONNUMERIC) # encode in proper UTF-8 before writing to file for line in lines: - line_encoded = __encode(line) - wr.writerow(line_encoded) + wr.writerow(line) def read_from_csv(file_path, delimiter=";"): """ diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index a901e19..3db14d5 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -18,28 +18,31 @@ # Copyright 2018-2019 by Anselm Fehnker # Copyright 2019 by Thomas Bock # Copyright 2020-2021 by Thomas Bock +# Copyright 2026 by Thomas Bock +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. """ This file is able to extract Github issue data from json files. 
""" import argparse -import httplib import json import os import sys -import urllib from datetime import datetime, timedelta +from logging import getLogger -import operator -from codeface.cli import log -from codeface.cluster.idManager import idManager -from codeface.configuration import Configuration -from codeface.dbmanager import DBManager +from codeface_utils.cluster.idManager import dbIdManager, csvIdManager +from codeface_utils.configuration import Configuration +from codeface_utils.dbmanager import DBManager from dateutil import parser as dateparser from csv_writer import csv_writer +# create logger +setup_logging() +log = getLogger(__name__) + # known types from JIRA and GitHub default labels known_types = {"bug", "improvement", "enhancement", "new feature", "task", "test", "wish"} @@ -61,7 +64,7 @@ def run(): # parse arguments args = parser.parse_args(sys.argv[1:]) - __codeface_conf, __project_conf = map(os.path.abspath, (args.config, args.project)) + __codeface_conf, __project_conf = list(map(os.path.abspath, (args.config, args.project))) # create configuration __conf = Configuration.load(__codeface_conf, __project_conf) @@ -95,7 +98,7 @@ def load(source_folder): """ srcfile = os.path.join(source_folder, "issues.json") - log.devinfo("Loading Github issues from file '{}'...".format(srcfile)) + log.info("Loading Github issues from file '{}'...".format(srcfile)) # check if file exists and exit early if not if not os.path.exists(srcfile): @@ -191,7 +194,7 @@ def lookup_user(user_dict, user): user["email"] is None or user["email"] == ""): # lookup user only if username is not None and not empty - if not user["username"] is None and not user["username"] == "": + if user["username"] is not None and not user["username"] == "": user = user_dict[user["username"]] return user @@ -210,8 +213,8 @@ def update_user_dict(user_dict, user): if user is None: user = create_deleted_user() - if not user["username"] in user_dict.keys(): - if not user["username"] is None and not 
user["username"] == "": + if user["username"] not in list(user_dict.keys()): + if user["username"] is not None and not user["username"] == "": user_dict[user["username"]] = user else: user_in_dict = user_dict[user["username"]] @@ -232,7 +235,7 @@ def reformat_issues(issue_data): :return: the re-arranged issue data """ - log.devinfo("Re-arranging Github issues...") + log.info("Re-arranging Github issues...") # re-process all issues for issue in issue_data: @@ -340,7 +343,7 @@ def merge_issue_events(issue_data): # as we cannot update the referenced issue during iterating over all issues, we need to save the # referenced_by event for the referenced issue temporarily - if rel_issue["number"] in issue_data_to_update.keys(): + if rel_issue["number"] in list(issue_data_to_update.keys()): issue_data_to_update[rel_issue["number"]]["eventsList"].append(referenced_issue_event) else: ref = dict() @@ -422,7 +425,7 @@ def merge_issue_events(issue_data): # add dismissal comments to the list of comments for event in issue["eventsList"]: - if (event["event"] == "review_dismissed" and not event["dismissalMessage"] is None + if (event["event"] == "review_dismissed" and event["dismissalMessage"] is not None and not event["dismissalMessage"] == ""): dismissalComment = dict() dismissalComment["event"] = "commented" @@ -500,7 +503,7 @@ def merge_issue_events(issue_data): issue["eventsList"] = sorted(issue["eventsList"], key=lambda k: k["created_at"]) # updates all the issues by the temporarily stored referenced_by events - for key, value in issue_data_to_update.iteritems(): + for _, value in issue_data_to_update.items(): for issue in issue_data: if issue["number"] == value["number"]: issue["eventsList"] = issue["eventsList"] + value["eventsList"] @@ -535,7 +538,7 @@ def reformat_events(issue_data): users = update_user_dict(users, event["user"]) # 3) add or update users which are ref_target of the current event - if not event["ref_target"] is None and not event["ref_target"] == "": + if 
event["ref_target"] is not None and not event["ref_target"] == "": users = update_user_dict(users, event["ref_target"]) # as the user dictionary is created, start re-formating the event information of all issues @@ -636,7 +639,7 @@ def reformat_events(issue_data): event["event_info_1"] = issue["state_new"] event["event_info_2"] = issue["resolution"] - elif event["event"] == "referenced" and not event["commit"] is None: + elif event["event"] == "referenced" and event["commit"] is not None: # remove "referenced" events originating from commits # as they are handled as referenced commit events_to_remove.append(event) @@ -670,10 +673,13 @@ def insert_user_data(issues, conf, resdir): user_id_buffer = dict() # create buffer for usernames (key: username) username_id_buffer = dict() - # open database connection - dbm = DBManager(conf) - # open ID-service connection - idservice = idManager(dbm, conf) + + # connect to ID service + if conf["useCsv"]: + idservice = csvIdManager(conf) + else: + dbm = DBManager(conf) + idservice = dbIdManager(dbm, conf) def get_user_string(name, email): if not email or email is None: @@ -683,26 +689,24 @@ def get_user_string(name, email): return "{name} <{email}>".format(name=name, email=email) def get_id_and_update_user(user, buffer_db_ids=user_id_buffer, buffer_usernames=username_id_buffer): - username = unicode(user["username"]).encode("utf-8") - # fix encoding for name and e-mail address - if user["name"] is not None: - name = unicode(user["name"]).encode("utf-8") - else: - name = username - mail = unicode(user["email"]).encode("utf-8") + # ensure string representation for name and e-mail address + username = str(user["username"]) + name = str(user["name"]) if "name" in user else username + mail = str(user["email"]) + # construct string for ID service and send query user_string = get_user_string(name, mail) # check buffer to reduce amount of DB queries if user_string in buffer_db_ids: - log.devinfo("Returning person id for user '{}' from 
buffer.".format(user_string)) + log.info("Returning person id for user '{}' from buffer.".format(user_string)) if username is not None: buffer_usernames[username] = buffer_db_ids[user_string] return buffer_db_ids[user_string] # get person information from ID service - log.devinfo("Passing user '{}' to ID service.".format(user_string)) + log.info("Passing user '{}' to ID service.".format(user_string)) idx = idservice.getPersonID(user_string) # add user information to buffer @@ -719,16 +723,17 @@ def get_user_from_id(idx, buffer_db=user_buffer): # check whether user information is in buffer to reduce amount of DB queries if idx in buffer_db: - log.devinfo("Returning user '{}' from buffer.".format(idx)) + log.info("Returning user '{}' from buffer.".format(idx)) return buffer_db[idx] # get person information from ID service - log.devinfo("Passing user id '{}' to ID service.".format(idx)) + log.info("Passing user id '{}' to ID service.".format(idx)) person = idservice.getPersonFromDB(idx) - user = dict() - user["email"] = person["email1"] # column "email1" - user["name"] = person["name"] # column "name" - user["id"] = person["id"] # column "id" + user = { + "name": person["name"], + "email": person["email1"], + "id": person["id"] + } # add user information to buffer buffer_db[idx] = user diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index d9748ae..4220b96 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -17,7 +17,8 @@ # Copyright 2018 by Barbara Eckl # Copyright 2018-2019 by Anselm Fehnker # Copyright 2020-2021 by Thomas Bock -# Copyright 2023 by Maximilian Löffler +# Copyright 2026 by Thomas Bock +# Copyright 2023, 2025 by Maximilian Löffler # All Rights Reserved. """ This file is able to extract Jira issue data from xml files. 
@@ -26,27 +27,29 @@ import argparse import os import sys -import time import csv import json +from logging import getLogger from xml.dom.minidom import parse -from datetime import datetime from dateutil import parser as dateparser -from codeface.cli import log -from codeface.cluster.idManager import idManager -from codeface.configuration import Configuration -from codeface.dbmanager import DBManager +from codeface_utils.cluster.idManager import dbIdManager, csvIdManager +from codeface_utils.configuration import Configuration +from codeface_utils.dbmanager import DBManager from csv_writer import csv_writer from jira import JIRA from jira.exceptions import JIRAError from time import sleep +import importlib -reload(sys) -sys.setdefaultencoding("utf-8") +importlib.reload(sys) + +# create logger +setup_logging() +log = getLogger(__name__) # global counter for JIRA requests to make sure to not exceed the request limit jira_request_counter = 0 @@ -65,7 +68,7 @@ def run(): # parse arguments args = parser.parse_args(sys.argv[1:]) - __codeface_conf, __project_conf = map(os.path.abspath, (args.config, args.project)) + __codeface_conf, __project_conf = list(map(os.path.abspath, (args.config, args.project))) # create configuration __conf = Configuration.load(__codeface_conf, __project_conf) @@ -114,9 +117,9 @@ def run(): processed_issues.extend(issues) # 4) insert referenced_by events into issue histories - for issue_id in referenced_bys.keys(): + for issue_id in list(referenced_bys.keys()): # obtain list of issues which have the current issue id - referenced_issue = list(filter(lambda issue: issue["externalId"] == issue_id, processed_issues)) + referenced_issue = list([issue for issue in processed_issues if issue["externalId"] == issue_id]) if len(referenced_issue) > 0: if len(referenced_issue) > 1: log.warning("Ambiguous issue id " + issue_id + " found in the issue list.") @@ -172,7 +175,7 @@ def load_xml(source_folder, xml_file): """ srcfile = os.path.join(source_folder, 
xml_file) - log.devinfo("Loading issues from file '{}'...".format(srcfile)) + log.info("Loading issues from file '{}'...".format(srcfile)) try: # parse the xml-file @@ -235,21 +238,21 @@ def merge_user_with_user_from_csv(user, persons): """ new_user = dict() - name_utf8 = unicode(user["name"]).encode("utf-8") - username_utf8 = unicode(user["username"].lower()).encode("utf-8") + name_utf8 = str(user["name"]).encode("utf-8") + username_utf8 = str(user["username"].lower()).encode("utf-8") - if username_utf8 in persons["by_username"].keys(): + if username_utf8 in list(persons["by_username"].keys()): new_user["username"] = username_utf8 - new_user["name"] = unicode(persons["by_username"].get(username_utf8)[0]).encode("utf-8") - new_user["email"] = unicode(persons["by_username"].get(username_utf8)[1]).encode("utf-8") - elif name_utf8 in persons["by_name"].keys(): + new_user["name"] = str(persons["by_username"].get(username_utf8)[0]).encode("utf-8") + new_user["email"] = str(persons["by_username"].get(username_utf8)[1]).encode("utf-8") + elif name_utf8 in list(persons["by_name"].keys()): new_user["username"] = username_utf8 - new_user["name"] = unicode(persons["by_name"].get(name_utf8)[0]).encode("utf-8") - new_user["email"] = unicode(persons["by_name"].get(name_utf8)[1]).encode("utf-8") + new_user["name"] = str(persons["by_name"].get(name_utf8)[0]).encode("utf-8") + new_user["email"] = str(persons["by_name"].get(name_utf8)[1]).encode("utf-8") else: new_user["username"] = username_utf8 new_user["name"] = name_utf8 - new_user["email"] = unicode(user["email"]).encode("utf-8") + new_user["email"] = str(user["email"]).encode("utf-8") log.warning("User not in csv-file: " + str(user)) log.info("current User: " + str(user) + ", new user: " + str(new_user)) @@ -290,7 +293,7 @@ def parse_xml(issue_data, persons, skip_history, referenced_bys): resolved = issue_x.getElementsByTagName("resolved") issue["resolveDate"] = "" - if (len(resolved) > 0) and (not resolved[0] is None): + if 
(len(resolved) > 0) and (resolved[0] is not None): resolveDate = resolved[0].firstChild.data issue["resolveDate"] = format_time(resolveDate) @@ -372,7 +375,7 @@ def parse_xml(issue_data, persons, skip_history, referenced_bys): text = comment_x.firstChild if text is None: - log.warn("Empty comment in issue " + issue["id"]) + log.warning("Empty comment in issue " + issue["id"]) comment["text"] = "" else: comment["text"] = text.data @@ -440,7 +443,7 @@ def load_issues_via_api(issues, persons, url, referenced_bys): api_issue = jira_project.issue(issue["externalId"], expand="changelog") changelog = api_issue.changelog except JIRAError: - log.warn("JIRA Error: Changelog cannot be extracted for issue " + issue["externalId"] + ". History omitted!") + log.warning("JIRA Error: Changelog cannot be extracted for issue " + issue["externalId"] + ". History omitted!") changelog = None histories = list() @@ -478,7 +481,7 @@ def load_issues_via_api(issues, persons, url, referenced_bys): if hasattr(change, "author"): user = create_user(change.author.displayName, change.author.name, "") else: - log.warn("No author for history: " + str(change.id) + " created at " + str(change.created)) + log.warning("No author for history: " + str(change.id) + " created at " + str(change.created)) user = create_user("","","") history["author"] = merge_user_with_user_from_csv(user, persons) history["date"] = format_time(change.created) @@ -498,7 +501,7 @@ def load_issues_via_api(issues, persons, url, referenced_bys): if hasattr(change, "author"): user = create_user(change.author.displayName, change.author.name, "") else: - log.warn("No author for history: " + str(change.id) + " created at " + str(change.created)) + log.warning("No author for history: " + str(change.id) + " created at " + str(change.created)) user = create_user("","","") history["author"] = merge_user_with_user_from_csv(user, persons) history["date"] = format_time(change.created) @@ -590,10 +593,13 @@ def insert_user_data(issues, conf): 
user_buffer = dict() # create buffer for user ids (key: user string) user_id_buffer = dict() - # open database connection - dbm = DBManager(conf) - # open ID-service connection - idservice = idManager(dbm, conf) + + # connect to ID service + if conf["useCsv"]: + idservice = csvIdManager(conf) + else: + dbm = DBManager(conf) + idservice = dbIdManager(dbm, conf) def get_user_string(name, email): if not email or email is None: @@ -603,22 +609,21 @@ def get_user_string(name, email): return "{name} <{email}>".format(name=name, email=email) def get_id_and_update_user(user, buffer_db_ids=user_id_buffer): - # fix encoding for name and e-mail address - if user["name"] is not None and user["name"] != "": - name = unicode(user["name"]).encode("utf-8") - else: - name = unicode(user["username"]).encode("utf-8") - mail = unicode(user["email"]).encode("utf-8") # empty + + # ensure string representation for name and e-mail address + name = str(user["name"]) if "name" in user else str(user["username"]) + mail = str(user["email"]) # may be empty + # construct string for ID service and send query user_string = get_user_string(name, mail) # check buffer to reduce amount of DB queries if user_string in buffer_db_ids: - log.devinfo("Returning person id for user '{}' from buffer.".format(user_string)) + log.info("Returning person id for user '{}' from buffer.".format(user_string)) return buffer_db_ids[user_string] # get person information from ID service - log.devinfo("Passing user '{}' to ID service.".format(user_string)) + log.info("Passing user '{}' to ID service.".format(user_string)) idx = idservice.getPersonID(user_string) # add user information to buffer @@ -631,16 +636,17 @@ def get_user_from_id(idx, buffer_db=user_buffer): # check whether user information is in buffer to reduce amount of DB queries if idx in buffer_db: - log.devinfo("Returning user '{}' from buffer.".format(idx)) + log.info("Returning user '{}' from buffer.".format(idx)) return buffer_db[idx] # get person 
information from ID service - log.devinfo("Passing user id '{}' to ID service.".format(idx)) + log.info("Passing user id '{}' to ID service.".format(idx)) person = idservice.getPersonFromDB(idx) - user = dict() - user["email"] = person["email1"] # column "email1" - user["name"] = person["name"] # column "name" - user["id"] = person["id"] # column "id" + user = { + "name": person["name"], + "email": person["email1"], + "id": person["id"] + } # add user information to buffer buffer_db[idx] = user @@ -1000,8 +1006,8 @@ def find_first_existing(source_folder, filenames): :return: the first existing file name, None otherwise """ - filenames = map(lambda fi: os.path.join(source_folder, fi), filenames) - existing = map(lambda fi: os.path.exists(fi), filenames) + filenames = [os.path.join(source_folder, fi) for fi in filenames] + existing = [os.path.exists(fi) for fi in filenames] first = next((i for (i, x) in enumerate(existing) if x), None) if first is not None: @@ -1020,17 +1026,17 @@ def find_first_existing(source_folder, filenames): log.error("Person files '{}' do not exist! 
Exiting early...".format(person_files)) sys.exit(-1) - log.devinfo("Loading person csv from file '{}'...".format(srcfile)) + log.info("Loading person csv from file '{}'...".format(srcfile)) with open(srcfile, "r") as f: person_data = csv.DictReader(f, delimiter=",", skipinitialspace=True) persons_by_username = {} persons_by_name = {} for row in person_data: - if not row["AuthorID"] in persons_by_username.keys(): - author_id_utf8 = unicode(row["AuthorID"]).encode("utf-8") + if row["AuthorID"] not in list(persons_by_username.keys()): + author_id_utf8 = str(row["AuthorID"]).encode("utf-8") persons_by_username[author_id_utf8] = (row["AuthorName"], row["userEmail"]) - if not row["AuthorName"] in persons_by_name.keys(): - author_name_utf8 = unicode(row["AuthorName"]).encode("utf-8") + if row["AuthorName"] not in list(persons_by_name.keys()): + author_name_utf8 = str(row["AuthorName"]).encode("utf-8") persons_by_name[author_name_utf8] = (row["AuthorName"], row["userEmail"]) persons = dict() diff --git a/mbox_parsing/mbox_parsing.py b/mbox_parsing/mbox_parsing.py index fd9fd59..1ad12a5 100644 --- a/mbox_parsing/mbox_parsing.py +++ b/mbox_parsing/mbox_parsing.py @@ -15,6 +15,8 @@ # Copyright 2017 by Raphael Nömmer # Copyright 2017-2019 by Claus Hunsen # Copyright 2018-2019 by Thomas Bock +# Copyright 2026 by Thomas Bock +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. """ This file is able to extract artifact occurrences in e-mail within mbox files. 
@@ -28,17 +30,20 @@ import shutil import sys from os.path import abspath +from logging import getLogger -from codeface.cli import log -from codeface.configuration import Configuration from joblib import Parallel, delayed from whoosh import index # import create_in, open_dir, exists_in from whoosh.analysis import StandardAnalyzer from whoosh.fields import Schema, TEXT, ID from whoosh.qparser import QueryParser +from codeface_utils.configuration import Configuration from csv_writer import csv_writer +# create logger +setup_logging() +log = getLogger(__name__) def __get_index(mbox, mbox_path, results_folder, schema, reindex): """Initialize the search index (and create it, if needed @@ -56,25 +61,25 @@ def __get_index(mbox, mbox_path, results_folder, schema, reindex): index_path = os.path.join(results_folder, "mbox-index", os.path.basename(mbox_path)) # 1) if reindexing, remove the index folder if os.path.exists(index_path) and reindex: - log.devinfo("Removing index from path '{}'...".format(index_path)) + log.info("Removing index from path '{}'...".format(index_path)) shutil.rmtree(index_path) # 2) Check if we need to create the index for Whoosh full-text search - log.devinfo("Checking for index in results folder...") + log.info("Checking for index in results folder...") if (not os.path.exists(index_path)) or (not index.exists_in(index_path)): # 2.1) create index - log.devinfo("Creating index for text search in results folder.") + log.info("Creating index for text search in results folder.") os.makedirs(index_path) # create path index.create_in(index_path, schema) # initialize as index path ix = index.open_dir(index_path) # open as index path writer = ix.writer() # add all messages to index for message in mbox: - writer.add_document(messageID=unicode(message['message-id']), content=__mbox_getbody(message)) + writer.add_document(messageID=str(message['message-id']), content=__mbox_getbody(message)) writer.commit() - log.devinfo("Index created, parsing will begin now.") 
+ log.info("Index created, parsing will begin now.") else: # 2.2) load index - log.devinfo("Index has already been created, parsing will begin right away.") + log.info("Index has already been created, parsing will begin right away.") ix = index.open_dir(index_path) return ix @@ -131,12 +136,12 @@ def __mbox_getbody(message): body = message.get_payload(decode=True) if body is None: - log.devinfo(message.get_content_type()) - log.devinfo( + log.info(message.get_content_type()) + log.info( "An image or some other content has been found that cannot be indexed. Message is given an empty body.") body = ' ' - return unicode(body, errors="replace") + return str(body, errors="replace") def __parse_execute(artifact, schema, my_index, include_filepath): @@ -149,7 +154,7 @@ def __parse_execute(artifact, schema, my_index, include_filepath): :return: a match list of tuples (file name, artifact, message ID) """ - log.devinfo("Searching for artifact ({}, {})...".format(artifact[0], artifact[1])) + log.info("Searching for artifact ({}, {})...".format(artifact[0], artifact[1])) result = [] @@ -247,7 +252,7 @@ def run(): args = parser.parse_args(sys.argv[1:]) __resdir = abspath(args.resdir) __maildir = abspath(args.maildir) - __codeface_conf, __project_conf = map(abspath, (args.config, args.project)) + __codeface_conf, __project_conf = list(map(abspath, (args.config, args.project))) # initialize configuration __conf = Configuration.load(__codeface_conf, __project_conf)