Merge pull request #141 from tekaracybersolutions/ps_everywhere-enhancement

chg: adapt ps_everywhere analyser to use JSONL format
cvandeplas authored Feb 25, 2025
2 parents 3533e41 + 65ef0e3 commit ca86f29
Showing 2 changed files with 241 additions and 110 deletions.
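
The switch from json to jsonl means the analyser now streams one record per line instead of writing a single sorted list. A minimal consumer sketch, using only the field names visible in the yields in the diff below ('process', 'timestamp', 'datetime', 'source'); the file name is hypothetical and depends on what save_result produces:

    #!/usr/bin/env python3
    # Sketch: read a JSONL result file produced by the reworked analyser.
    # 'ps_everywhere.jsonl' is a placeholder path; 'timestamp'/'datetime' may be None
    # for sources such as uuid2path or remotectl_dumpstate.
    import json

    with open('ps_everywhere.jsonl') as f:
        for line in f:
            record = json.loads(line)
            print(record['source'], record['datetime'], record['process'])
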
347 changes: 240 additions & 107 deletions src/sysdiagnose/analysers/ps_everywhere.py
@@ -1,6 +1,7 @@
#! /usr/bin/env python3

from sysdiagnose.utils.base import BaseAnalyserInterface
from typing import Generator
from sysdiagnose.utils.base import BaseAnalyserInterface, logger
from sysdiagnose.parsers.ps import PsParser
from sysdiagnose.parsers.psthread import PsThreadParser
from sysdiagnose.parsers.spindumpnosymbols import SpindumpNoSymbolsParser
@@ -12,122 +13,254 @@


class PsEverywhereAnalyser(BaseAnalyserInterface):
"""
Analyser that gathers process information from multiple sources
to build a comprehensive list of running processes across different system logs.
"""

description = "List all processes we can find a bit everywhere."
format = "json"
format = "jsonl"

def __init__(self, config: dict, case_id: str):
super().__init__(__file__, config, case_id)
self.all_ps = set()

def execute(self):
# the order of below is important: we want to have the most detailed information first
# - first processes with full path and parameters
# - then processes with full path and no parameters
# - then processes no full path and no parameters

# processes with full path and parameters, no threads
ps_json = PsParser(self.config, self.case_id).get_result()
self.all_ps.update([p['command'] for p in ps_json])
print(f"{len(self.all_ps)} entries after ps")

# processes with full path and parameters

psthread_json = PsThreadParser(self.config, self.case_id).get_result()
self.all_ps.update([p['command'] for p in psthread_json])
print(f"{len(self.all_ps)} entries after psthread")

# processes with full path, no parameters, with threads
spindumpnosymbols_json = SpindumpNoSymbolsParser(self.config, self.case_id).get_result()
for p in spindumpnosymbols_json:
if 'process' not in p:
continue
try:
self.add_if_full_command_is_not_in_set(p['path'])
# all_ps.add(f"{p['path']}::#{len(p['threads'])}") # count is different than in taskinfo
except KeyError:
if p['process'] == 'kernel_task [0]':
self.all_ps.add('/kernel') # is similar to the other formats
else:
                    self.add_if_full_command_is_not_in_set(p['process'])  # backup option to keep track of this anomaly
for t in p['threads']:
try:
self.add_if_full_command_is_not_in_set(f"{p['path']}::{t['thread_name']}")
except KeyError:
pass
print(f"{len(self.all_ps)} entries after spindumpnosymbols")

# processes with full path, no parameters, no threads
shutdownlogs_json = ShutdownLogsParser(self.config, self.case_id).get_result()
for p in shutdownlogs_json:
# not using 'path' but 'command', as the path being appended by the UUID will be counter productive to normalisation
self.add_if_full_command_is_not_in_set(p['command'])
print(f"{len(self.all_ps)} entries after shutdownlogs")

# processes with full path, no parameters, no threads
logarchive_procs = set()
for event in LogarchiveParser(self.config, self.case_id).get_result():
try:
logarchive_procs.add(event['process'])
except KeyError:
pass

for entry in logarchive_procs:
self.add_if_full_command_is_not_in_set(entry)
print(f"{len(self.all_ps)} entries after logarchive")

# processes with full path, no parameters, no threads
uuid2path_json = UUID2PathParser(self.config, self.case_id).get_result()
for item in uuid2path_json.values():
self.add_if_full_command_is_not_in_set(item)
print(f"{len(self.all_ps)} entries after uuid2path")

# processes no full path, no parameters, with threads
taskinfo_json = TaskinfoParser(self.config, self.case_id).get_result()
# p['name'] is the short version of COMMAND, so incompatible with the other formats.
# on the other hand it may contain valuable stuff, so we use it in 2 formats
# - name::#num_of_threads
# - name::thread name
for p in taskinfo_json:
if 'name' not in p:
continue
self.add_if_full_path_is_not_in_set(p['name'])
# add_if_full_path_is_not_in_set(f"{p['name']}::#{len(p['threads'])}") # count is different than in spindumpnosymbols
for t in p['threads']:
try:
self.add_if_full_path_is_not_in_set(f"{p['name']}::{t['thread name']}")
except KeyError:
pass
print(f"{len(self.all_ps)} entries after taskinfo")

# processes no full path, no parameters, no threads
remotectl_dumpstate_json = RemotectlDumpstateParser(self.config, self.case_id).get_result()
if remotectl_dumpstate_json:
for p in remotectl_dumpstate_json['Local device']['Services']:
self.add_if_full_path_is_not_in_set(p)

print(f"{len(self.all_ps)} entries after remotectl_dumpstate")

# TODO powerlogs - bundleID, ProcessName

self.all_ps = list(self.all_ps)
self.all_ps.sort()
return self.all_ps

def add_if_full_path_is_not_in_set(self, name: str):
@staticmethod
def _strip_flags(process: str) -> str:
"""
Extracts the base command by removing everything after the first space.
:param process: Full process command string.
:return: Command string without flags.
"""
process, *_ = process.partition(' ')
return process

def execute(self) -> Generator[dict, None, None]:
"""
Executes all extraction methods dynamically, ensuring that each extracted process is unique.
:yield: A dictionary containing process details from various sources.
"""
for func in dir(self):
if func.startswith(f"_{self.__class__.__name__}__extract_ps_"):
yield from getattr(self, func)() # Dynamically call extract methods

def __extract_ps_base_file(self) -> Generator[dict, None, None]:
"""
Extracts process data from ps.txt.
:return: A generator yielding dictionaries containing process details from ps.txt.
"""
entity_type = 'ps.txt'
try:
for p in PsParser(self.config, self.case_id).get_result():
ps_event = {
'process': self._strip_flags(p['command']),
'timestamp': p['timestamp'],
'datetime': p['datetime'],
'source': entity_type
}
if self.add_if_full_command_is_not_in_set(ps_event['process']):
yield ps_event
except Exception as e:
logger.exception(f"ERROR while extracting {entity_type} file. {e}")

def __extract_ps_thread_file(self) -> Generator[dict, None, None]:
"""
Extracts process data from psthread.txt.
:return: A generator yielding dictionaries containing process details from psthread.txt.
"""
entity_type = 'psthread.txt'
try:
for p in PsThreadParser(self.config, self.case_id).get_result():
ps_event = {
'process': self._strip_flags(p['command']),
'timestamp': p['timestamp'],
'datetime': p['datetime'],
'source': entity_type
}
if self.add_if_full_command_is_not_in_set(ps_event['process']):
yield ps_event
except Exception as e:
logger.exception(f"ERROR while extracting {entity_type} file. {e}")

def __extract_ps_spindump_nosymbols_file(self) -> Generator[dict, None, None]:
"""
Extracts process data from spindump-nosymbols.txt.
:return: A generator yielding dictionaries containing process and thread details from spindump-nosymbols.txt.
"""
entity_type = 'spindump-nosymbols.txt'
try:
for p in SpindumpNoSymbolsParser(self.config, self.case_id).get_result():
if 'process' not in p:
continue
process_name = p.get('path', '/kernel' if p['process'] == 'kernel_task [0]' else p['process'])

if self.add_if_full_command_is_not_in_set(self._strip_flags(process_name)):
yield {
'process': self._strip_flags(process_name),
'timestamp': p['timestamp'],
'datetime': p['datetime'],
'source': entity_type
}

for t in p['threads']:
try:
thread_name = f"{self._strip_flags(process_name)}::{t['thread_name']}"
if self.add_if_full_command_is_not_in_set(thread_name):
yield {
'process': thread_name,
'timestamp': p['timestamp'],
'datetime': p['datetime'],
'source': entity_type
}
except KeyError:
pass
except Exception as e:
logger.exception(f"ERROR while extracting {entity_type} file. {e}")

def __extract_ps_shutdownlogs(self) -> Generator[dict, None, None]:
"""
Extracts process data from shutdown logs.
:return: A generator yielding dictionaries containing process details from shutdown logs.
"""
entity_type = 'shutdown.logs'
try:
for p in ShutdownLogsParser(self.config, self.case_id).get_result():
if self.add_if_full_command_is_not_in_set(self._strip_flags(p['command'])):
yield {
'process': self._strip_flags(p['command']),
'timestamp': p['timestamp'],
'datetime': p['datetime'],
'source': entity_type
}
except Exception as e:
logger.exception(f"ERROR while extracting {entity_type}. {e}")

def __extract_ps_logarchive(self) -> Generator[dict, None, None]:
"""
Extracts process data from logarchive.
:return: A generator yielding dictionaries containing process details from logarchive.
"""
entity_type = 'log archive'
try:
for p in LogarchiveParser(self.config, self.case_id).get_result():
if self.add_if_full_command_is_not_in_set(self._strip_flags(p['process'])):
yield {
'process': self._strip_flags(p['process']),
'timestamp': p['timestamp'],
'datetime': p['datetime'],
'source': entity_type
}
except Exception as e:
logger.exception(f"ERROR while extracting {entity_type}. {e}")

def __extract_ps_uuid2path(self) -> Generator[dict, None, None]:
"""
Extracts process data from UUID2PathParser.
:return: A generator yielding process data from uuid2path.
"""
entity_type = 'uuid2path'
try:
for p in UUID2PathParser(self.config, self.case_id).get_result().values():
if self.add_if_full_command_is_not_in_set(self._strip_flags(p)):
yield {
'process': self._strip_flags(p),
'timestamp': None,
'datetime': None,
'source': entity_type
}
except Exception as e:
logger.exception(f"ERROR while extracting {entity_type}. {e}")

def __extract_ps_taskinfo(self) -> Generator[dict, None, None]:
"""
Extracts process and thread information from TaskinfoParser.
:return: A generator yielding process and thread information from taskinfo.
"""
entity_type = 'taskinfo.txt'
try:
for p in TaskinfoParser(self.config, self.case_id).get_result():
if 'name' not in p:
continue

if self.add_if_full_path_is_not_in_set(self._strip_flags(p['name'])):
yield {
'process': self._strip_flags(p['name']),
'timestamp': p['timestamp'],
'datetime': p['datetime'],
'source': entity_type
}

for t in p['threads']:
try:
thread_name = f"{self._strip_flags(p['name'])}::{t['thread name']}"
if self.add_if_full_path_is_not_in_set(thread_name):
yield {
'process': thread_name,
'timestamp': p['timestamp'],
'datetime': p['datetime'],
'source': entity_type
}
except KeyError:
pass
except Exception as e:
logger.exception(f"ERROR while extracting {entity_type}. {e}")

def __extract_ps_remotectl_dumpstate(self) -> Generator[dict, None, None]:
"""
Extracts process data from RemotectlDumpstateParser.
:return: A generator yielding process data from remotectl_dumpstate.txt.
"""
entity_type = 'remotectl_dumpstate.txt'
try:
remotectl_dumpstate_json = RemotectlDumpstateParser(self.config, self.case_id).get_result()
if remotectl_dumpstate_json:
for p in remotectl_dumpstate_json['Local device']['Services']:
if self.add_if_full_path_is_not_in_set(self._strip_flags(p)):
yield {
'process': self._strip_flags(p),
'timestamp': None,
'datetime': None,
'source': entity_type
}
except Exception as e:
logger.exception(f"ERROR while extracting {entity_type}. {e}")

def add_if_full_path_is_not_in_set(self, name: str) -> bool:
"""
Ensures that a process path is unique before adding it to the shared set.
:param name: Process path name
:return: True if the process was not in the set and was added, False otherwise.
"""
        for item in self.all_ps:
            # no need to add it in the following cases
            if item.endswith(name):
                return False
            if item.split('::')[0].endswith(name):
                return False
            if '::' not in item and item.split(' ')[0].endswith(name):
                return False  # This covers cases with space-separated commands
        self.all_ps.add(name)
        return True

    def add_if_full_command_is_not_in_set(self, name: str) -> bool:
        """
        Ensures that a process command is unique before adding it to the shared set.
        :param name: Process command name
        :return: True if the process was not in the set and was added, False otherwise.
        """
        for item in self.all_ps:
            if item.startswith(name):
                # no need to add it
                return False
        self.all_ps.add(name)
        return True
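
A note on the dispatch in the new execute(): the double-underscore extraction methods are name-mangled by Python to _PsEverywhereAnalyser__extract_ps_..., which is why the prefix built from self.__class__.__name__ finds them via dir(). A self-contained sketch of the same pattern (class and method names are illustrative only, not part of the commit):

    from typing import Generator


    class DemoAnalyser:
        # Double-underscore methods are name-mangled to _DemoAnalyser__extract_ps_*,
        # so dir(self) exposes them under that prefix.
        def __extract_ps_alpha(self) -> Generator[dict, None, None]:
            yield {'process': '/usr/bin/alpha', 'source': 'alpha'}

        def __extract_ps_beta(self) -> Generator[dict, None, None]:
            yield {'process': '/usr/bin/beta', 'source': 'beta'}

        def execute(self) -> Generator[dict, None, None]:
            for func in dir(self):
                if func.startswith(f"_{self.__class__.__name__}__extract_ps_"):
                    yield from getattr(self, func)()  # call each extractor in turn


    print(list(DemoAnalyser().execute()))
    # -> records from both __extract_ps_* methods, in dir()'s alphabetical order

Since dir() returns names sorted alphabetically, the extraction order in the new execute() appears to follow the method names (base_file, logarchive, remotectl_dumpstate, ...) rather than the explicit most-detailed-first ordering of the old execute().
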
4 changes: 1 addition & 3 deletions tests/test_analysers_ps_everywhere.py
@@ -11,12 +11,10 @@ def test_analyse_ps_everywhere(self):
# run the analyser
a = PsEverywhereAnalyser(self.sd.config, case_id=case_id)
a.save_result(force=True)

self.assertTrue(os.path.isfile(a.output_file))
self.assertTrue(os.path.getsize(a.output_file) > 0)

result = a.get_result()
self.assertGreater(len(result), 0)


if __name__ == '__main__':
unittest.main()
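
For completeness, the calling sequence exercised by this test is the same one a standalone script would use. A hedged sketch, assuming a parsed sysdiagnose case; the config object and case_id come from case setup (as self.sd.config does in the test) and are assumptions here:

    from sysdiagnose.analysers.ps_everywhere import PsEverywhereAnalyser


    def dump_processes(config, case_id: str) -> None:
        # config and case_id must belong to an already-created and parsed case.
        analyser = PsEverywhereAnalyser(config, case_id=case_id)
        analyser.save_result(force=True)   # writes the JSONL output file
        for record in analyser.get_result():
            print(record['source'], record['process'])
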
