Skip to content

Commit 6fe44f7

Browse files
authored
Merge pull request #11 from quarkslab/update-db-creation-api
update create() function to enable filling entirely the database
2 parents b0845f9 + 591e0d5 commit 6fe44f7

File tree

2 files changed

+193
-43
lines changed

2 files changed

+193
-43
lines changed

src/bindiff/file.py

+144-43
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
from pathlib import Path
22
import sqlite3
3-
import hashlib
43
from datetime import datetime
54
from dataclasses import dataclass
65
from typing import Union
76
import ctypes
87

9-
from bindiff.types import FunctionAlgorithm, BasicBlockAlgorithm
8+
from bindiff.types import FunctionAlgorithm, BasicBlockAlgorithm, function_algorithm_str, basicblock_algorithm_str
109

1110

1211
@dataclass
@@ -78,8 +77,10 @@ class BindiffFile(object):
7877
def __init__(self, file: Union[Path, str], permission: str = "ro"):
7978
"""
8079
:param file: path to Bindiff database
81-
:param permission: permission to use for opening database (default: ro)
80+
:param permission: database permissions (default: ro)
8281
"""
82+
assert permission in ["ro", "rw"]
83+
8384
self._file = file
8485

8586
# Open database
@@ -92,13 +93,11 @@ def __init__(self, file: Union[Path, str], permission: str = "ro"):
9293
self.version: str = None #: version of the differ used for diffing
9394
self.created: datetime = None #: Database creation date
9495
self.modified: datetime = None #: Database last modification date
95-
self._load_metadata(self.db.cursor())
96+
9697

9798
# Files
9899
self.primary_file: File = None #: Primary file
99100
self.secondary_file: File = None #: Secondary file
100-
self._load_file(self.db.cursor())
101-
# fmt: on
102101

103102
# Function matches
104103
self.primary_functions_match: dict[
@@ -107,7 +106,6 @@ def __init__(self, file: Union[Path, str], permission: str = "ro"):
107106
self.secondary_functions_match: dict[
108107
int, FunctionMatch
109108
] = {} #: FunctionMatch indexed by addresses in secondary
110-
self._load_function_match(self.db.cursor())
111109

112110
# Basicblock matches: BB-addr -> fun-addr -> match
113111
self.primary_basicblock_match: dict[
@@ -116,13 +114,21 @@ def __init__(self, file: Union[Path, str], permission: str = "ro"):
116114
self.secondary_basicblock_match: dict[
117115
int, dict[int, BasicBlockMatch]
118116
] = {} #: Basic block match from secondary
119-
self._load_basicblock_match(self.db.cursor())
117+
120118

121119
# Instruction matches
122120
# {inst_addr : {match_func_addr : match_inst_addr}}
123121
self.primary_instruction_match: dict[int, dict[int, int]] = {}
124122
self.secondary_instruction_match: dict[int, dict[int, int]] = {}
125-
self._load_instruction_match(self.db.cursor())
123+
124+
# If 'ro', load database content
125+
if permission == "ro":
126+
self._load_metadata(self.db.cursor())
127+
self._load_file(self.db.cursor())
128+
self._load_function_match(self.db.cursor())
129+
self._load_basicblock_match(self.db.cursor())
130+
self._load_instruction_match(self.db.cursor())
131+
126132

127133
@property
128134
def unmatched_primary_count(self) -> int:
@@ -169,7 +175,7 @@ def _load_file(self, cursor: sqlite3.Cursor) -> None:
169175
:param cursor: sqlite3 cursor to the DB
170176
"""
171177
files = cursor.execute("SELECT * FROM file").fetchall()
172-
assert len(files) >= 2
178+
# assert len(files) >= 2
173179

174180
self.primary_file = File(*files[0])
175181
self.secondary_file = File(*files[1])
@@ -268,7 +274,7 @@ def init_database(db: sqlite3.Connection) -> None:
268274
CREATE TABLE metadata (version TEXT, file1 INTEGER, file2 INTEGER, description TEXT, created DATE,
269275
modified DATE, similarity DOUBLE PRECISION, confidence DOUBLE PRECISION,
270276
FOREIGN KEY(file1) REFERENCES file(id), FOREIGN KEY(file2) REFERENCES file(id))""")
271-
conn.execute("""CREATE TABLE functionalgorithm (id SMALLINT PRIMARY KEY, name TEXT)""")
277+
conn.execute("""CREATE TABLE functionalgorithm (id INTEGER PRIMARY KEY, name TEXT)""")
272278
conn.execute("""
273279
CREATE TABLE function (id INTEGER PRIMARY KEY, address1 BIGINT, name1 TEXT, address2 BIGINT,
274280
name2 TEXT, similarity DOUBLE PRECISION, confidence DOUBLE PRECISION, flags INTEGER,
@@ -286,16 +292,11 @@ def init_database(db: sqlite3.Connection) -> None:
286292
db.commit()
287293
# fmt: on
288294

289-
conn.execute(
290-
"""INSERT INTO basicblockalgorithm(name) VALUES ("basicBlock: edges prime product")"""
291-
)
292295
db.commit()
293296

294297
@staticmethod
295298
def create(
296299
filename: str,
297-
primary: str,
298-
secondary: str,
299300
version: str,
300301
desc: str,
301302
similarity: float,
@@ -306,8 +307,6 @@ def create(
306307
It only takes two binaries.
307308
308309
:param filename: database file path
309-
:param primary: path to primary export file
310-
:param secondary: path to secondary export file
311310
:param version: version of the differ used
312311
:param desc: description of the database
313312
:param similarity: similarity score between the two binaries
@@ -320,22 +319,6 @@ def create(
320319

321320
conn = db.cursor()
322321

323-
# Save primary
324-
file1 = Path(primary)
325-
hash1 = hashlib.sha256(file1.read_bytes()).hexdigest() if file1.exists() else ""
326-
conn.execute(
327-
"""INSERT INTO file (filename, exefilename, hash) VALUES (:filename, :name, :hash)""",
328-
{"filename": str(file1.with_suffix("").name), "name": file1.name, "hash": hash1},
329-
)
330-
331-
# Save secondary
332-
file2 = Path(secondary)
333-
hash2 = hashlib.sha256(file2.read_bytes()).hexdigest() if file2.exists() else ""
334-
conn.execute(
335-
"""INSERT INTO file (filename, exefilename, hash) VALUES (:filename, :name, :hash)""",
336-
{"filename": str(file2.with_suffix("").name), "name": file2.name, "hash": hash2},
337-
)
338-
339322
conn.execute(
340323
"""
341324
INSERT INTO metadata (version, file1, file2, description, created, modified, similarity, confidence)
@@ -353,10 +336,89 @@ def create(
353336
},
354337
)
355338

339+
# Fill functionalgorithm table
340+
for algo in FunctionAlgorithm:
341+
algo_str = function_algorithm_str(algo)
342+
conn.execute(
343+
"""INSERT INTO functionalgorithm (name) VALUES (:name)""",
344+
{"name": f"function: {algo_str}"},
345+
)
346+
347+
# Fill basicblockalgorithm table
348+
for algo in BasicBlockAlgorithm:
349+
algo_str = basicblock_algorithm_str(algo)
350+
conn.execute(
351+
"""INSERT INTO basicblockalgorithm (name) VALUES (:name)""",
352+
{"name": f"basicBlock: {algo_str}"},
353+
)
354+
355+
356356
db.commit()
357357
db.close()
358358
return BindiffFile(filename, permission="rw")
359359

360+
def add_file_matched(
    self,
    export_name: str,
    hash: str,
    executable_name: str = "",
    functions: int = 0,
    libfunctions: int = 0,
    calls: int = 0,
    basicblocks: int = 0,
    libbasicblocks: int = 0,
    edges: int = 0,
    libedges: int = 0,
    instructions: int = 0,
    libinstructions: int = 0,
) -> int:
    """
    Add a matched file entry in the ``file`` table.
    Only export_name and hash are mandatory.

    :warning: not providing the other fields might not
              render correctly in Bindiff, or IDA plugins.

    :param export_name: Export filename (with extension).
    :param hash: SHA256 hash of the executable
    :param executable_name: Executable filename (if none is provided, export without extension)
    :param functions: number of functions
    :param libfunctions: number of library functions
    :param calls: number of calls
    :param basicblocks: number of basic blocks
    :param libbasicblocks: number of library basic blocks
    :param edges: number of CFG edges
    :param libedges: number of library CFG edges
    :param instructions: number of instructions
    :param libinstructions: number of library instructions
    :return: id of the row inserted in database.
    """
    # Only the last extension is stripped: "a.exe.BinExport" -> "a.exe".
    export_stem = Path(export_name).with_suffix("").name

    params = {
        "filename": export_stem,
        "exefilename": executable_name if executable_name else export_stem,
        "hash": hash,
        "functions": functions,
        "libfunctions": libfunctions,
        "calls": calls,
        "basicblocks": basicblocks,
        "libbasicblocks": libbasicblocks,
        "edges": edges,
        "libedges": libedges,
        "instructions": instructions,
        "libinstructions": libinstructions,
    }

    # Column names come from the fixed dict above (never from user input);
    # values are bound through named placeholders.
    columns = ",".join(params)
    placeholders = ",".join(f":{key}" for key in params)

    cursor = self.db.cursor()
    cursor.execute(
        f"INSERT INTO file ({columns}) VALUES ({placeholders})",
        params,
    )
    # Same convention as the other add_* methods: hand back the new row id.
    return cursor.lastrowid
420+
421+
360422
def add_function_match(
361423
self,
362424
fun_addr1: int,
@@ -382,8 +444,21 @@ def add_function_match(
382444
cursor = self.db.cursor()
383445
cursor.execute(
384446
"""
385-
INSERT INTO function (address1, address2, name1, name2, similarity, confidence, basicblocks)
386-
VALUES (:address1, :address2, :name1, :name2, :similarity, :confidence, :identical_bbs)
447+
INSERT INTO function (address1,
448+
address2,
449+
name1,
450+
name2,
451+
similarity,
452+
confidence,
453+
flags,
454+
algorithm,
455+
evaluate,
456+
commentsported,
457+
basicblocks,
458+
edges,
459+
instructions)
460+
VALUES (:address1, :address2, :name1, :name2, :similarity,
461+
:confidence, 0, 19, 0, 0, :identical_bbs, 0, 0)
387462
""",
388463
{
389464
"address1": fun_addr1,
@@ -398,13 +473,12 @@ def add_function_match(
398473
return cursor.lastrowid
399474

400475
def add_basic_block_match(
401-
self, fun_addr1: int, fun_addr2: int, bb_addr1: int, bb_addr2: int
476+
self, funentry_id: int, bb_addr1: int, bb_addr2: int
402477
) -> int:
403478
"""
404479
Add a basic block match in database.
405480
406-
:param fun_addr1: function address of basic block in primary
407-
:param fun_addr2: function address of basic block in secondary
481+
:param funentry_id: database id of the function match entry
408482
:param bb_addr1: basic block address in primary
409483
:param bb_addr2: basic block address in secondary
410484
:return: id of the row inserted in database.
@@ -413,15 +487,15 @@ def add_basic_block_match(
413487

414488
cursor.execute(
415489
"""
416-
INSERT INTO basicblock (functionid, address1, address2, algorithm)
417-
VALUES ((SELECT id FROM function WHERE address1=:function_address1 AND address2=:function_address2), :address1, :address2, :algorithm)
490+
INSERT INTO basicblock (functionid, address1, address2, algorithm, evaluate)
491+
VALUES (:funentry_id, :address1, :address2, :algorithm, :evaluate)
418492
""",
419493
{
420-
"function_address1": fun_addr1,
421-
"function_address2": fun_addr2,
494+
"funentry_id": funentry_id,
422495
"address1": bb_addr1,
423496
"address2": bb_addr2,
424497
"algorithm": "1",
498+
"evaluate": "0"
425499
},
426500
)
427501
return cursor.lastrowid
@@ -475,4 +549,31 @@ def update_file_infos(
475549
"instructions": inst_count,
476550
},
477551
)
552+
553+
554+
def update_samebb_function_match(self, funentry_id: int, same_bb_count: int) -> None:
    """
    Update same basicblock information in function table

    :param funentry_id: id of function match entry
    :param same_bb_count: number of identical basic blocks
    """
    cursor = self.db.cursor()

    # The id is bound as an integer, like funentry_id in add_basic_block_match;
    # no string conversion is needed to compare against the primary key.
    cursor.execute(
        """
        UPDATE function SET basicblocks = :bb_count WHERE id = :entry_id
        """,
        {
            "entry_id": funentry_id,
            "bb_count": same_bb_count,
        },
    )
573+
574+
575+
def commit(self) -> None:
    """
    Commit all pending transactions to the database.
    """
    self.db.commit()

src/bindiff/types.py

+49
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,32 @@ class BasicBlockAlgorithm(IntEnum):
3838
manual = 20
3939

4040

41+
def basicblock_algorithm_str(algo: BasicBlockAlgorithm) -> str:
42+
match algo:
43+
case BasicBlockAlgorithm.edges_prime_product: return "edges prime product"
44+
case BasicBlockAlgorithm.hash_matching_four_inst_min: return "hash matching (4 instructions minimum)"
45+
case BasicBlockAlgorithm.prime_matching_four_inst_min: return "prime matching (4 instructions minimum)"
46+
case BasicBlockAlgorithm.call_reference_matching: return "call reference matching"
47+
case BasicBlockAlgorithm.string_references_matching: return "string reference matching"
48+
case BasicBlockAlgorithm.edges_md_index_top_down: return "edges MD index (top down)"
49+
case BasicBlockAlgorithm.md_index_matching_top_down: return "MD index matching (top down)"
50+
case BasicBlockAlgorithm.edges_md_index_bottom_up: return "edges MD index (bottom up)"
51+
case BasicBlockAlgorithm.md_index_matching_bottom_up: return "MD index matching (bottom up)"
52+
case BasicBlockAlgorithm.relaxed_md_index_matching: return "relaxed MD index matching"
53+
case BasicBlockAlgorithm.prime_matching_no_inst_min: return "prime matching (0 instructions minimum)"
54+
case BasicBlockAlgorithm.edges_lengauer_tarjan_dominated: return "edges Lengauer Tarjan dominated"
55+
case BasicBlockAlgorithm.loop_entry_matching: return "loop entry matching"
56+
case BasicBlockAlgorithm.self_loop_matching: return "self loop matching"
57+
case BasicBlockAlgorithm.entry_point_matching: return "entry point matching"
58+
case BasicBlockAlgorithm.exit_point_matching: return "exit point matching"
59+
case BasicBlockAlgorithm.instruction_count_matching: return "instruction count matching"
60+
case BasicBlockAlgorithm.jump_sequence_matching: return "jump sequence matching"
61+
case BasicBlockAlgorithm.propagation_size_one: return "propagation (size==1)"
62+
case BasicBlockAlgorithm.manual: return "manual"
63+
case _:
64+
assert False
65+
66+
4167
class FunctionAlgorithm(IntEnum):
4268
"""
4369
Function matching algorithm enum. (id's does not seem to change in
@@ -63,3 +89,26 @@ class FunctionAlgorithm(IntEnum):
6389
call_sequence_matching_sequence = 17
6490
call_reference_matching = 18
6591
manual = 19
92+
93+
def function_algorithm_str(algo: FunctionAlgorithm) -> str:
94+
match algo:
95+
case FunctionAlgorithm.name_hash_matching: return "name hash matching"
96+
case FunctionAlgorithm.hash_matching: return "hash matching"
97+
case FunctionAlgorithm.edges_flowgraph_md_index: return "edges flowgraph MD index"
98+
case FunctionAlgorithm.edges_callgraph_md_index: return "edges callgraph MD index"
99+
case FunctionAlgorithm.md_index_matching_flowgraph_top_down: return "MD index matching (flowgraph MD index, top down)"
100+
case FunctionAlgorithm.md_index_matching_flowgraph_bottom_up: return "MD index matching (flowgraph MD index, bottom up)"
101+
case FunctionAlgorithm.prime_signature_matching: return "signature matching"
102+
case FunctionAlgorithm.md_index_matching_callGraph_top_down: return "MD index matching (callGraph MD index, top down)"
103+
case FunctionAlgorithm.md_index_matching_callGraph_bottom_up: return "MD index matching (callGraph MD index, bottom up)"
104+
case FunctionAlgorithm.relaxed_md_index_matching: return "MD index matching"
105+
case FunctionAlgorithm.instruction_count: return "instruction count"
106+
case FunctionAlgorithm.address_sequence: return "address sequence"
107+
case FunctionAlgorithm.string_references: return "string references"
108+
case FunctionAlgorithm.loop_count_matching: return "loop count matching"
109+
case FunctionAlgorithm.call_sequence_matching_exact: return "call sequence matching(exact)"
110+
case FunctionAlgorithm.call_sequence_matching_topology: return "call sequence matching(topology)"
111+
case FunctionAlgorithm.call_sequence_matching_sequence: return "call sequence matching(sequence)"
112+
case FunctionAlgorithm.call_reference_matching: return "call rerferences matching"
113+
case FunctionAlgorithm.manual: return "manual"
114+
case _: assert False

0 commit comments

Comments
 (0)