From 7e51802a3255ed025a661b546b6318b531a69383 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C4=BDubo=C5=A1=20Bever?= Date: Wed, 15 May 2024 23:58:24 +0200 Subject: [PATCH] Add TLSH Distance implementation with tests and documentation Add three signatures of tlsh_diff function used to compute TLSH distance from the whole file, part of the file or from the string. Add unit tests and add information about implementation into hash module documentation. --- docs/modules/hash.rst | 35 ++++ libyara/include/tlshc/tlsh.h | 4 +- libyara/modules/hash/hash.c | 391 +++++++++++++++++++++++++++++++++++ libyara/tlshc/tlsh.c | 24 ++- tests/test-rules.c | 57 +++++ 5 files changed, 509 insertions(+), 2 deletions(-) diff --git a/docs/modules/hash.rst b/docs/modules/hash.rst index 8829ed2cc8..5e62e23a2d 100644 --- a/docs/modules/hash.rst +++ b/docs/modules/hash.rst @@ -10,6 +10,14 @@ Hash module The Hash module allows you to calculate hashes (MD5, SHA1, SHA256) from portions of your file and create signatures based on those hashes. +It also allows you to work with Locality Sensitive Hashes from Trend Micro (TLSH). +Specifically, you can compute the distance between the TLSH of a portion of your +file (min. 50 bytes) and an input TLSH string. Distance scores can reach 1000 and +even more. A low score (50 or less) means that the files are quite similar, while +a distance of zero means (very likely) an exact match. Just like with MD5 and +SHA1, collisions can occur, so very different files may have the same hash +value. + .. important:: This module depends on the OpenSSL library. Please refer to :ref:`compiling-yara` for information about how to build OpenSSL-dependant @@ -24,6 +32,14 @@ of your file and create signatures based on those hashes. requires the hash string to be given in lowercase, otherwise the match condition will not work. (see https://github.com/VirusTotal/yara/issues/1004) + The TLSH is not valid in lowercase. 
Therefore, the input hash must be in uppercase, which differs + from traditional hash functions. The module accepts a TLSH either with or without the leading + "T1" prefix that specifies the TLSH version. + + DISCLAIMER: Computing a TLSH is very slow, comparable to SSDEEP hashing, i.e. approx. + 5.4 times slower than the SHA1 function. Adding the `tlsh_diff` function to a YARA rule can extend + its evaluation time by up to 15%. Be especially careful when scanning files bigger than 5 MB. + .. c:function:: md5(offset, size) Returns the MD5 hash for *size* bytes starting at *offset*. When scanning a @@ -78,3 +94,22 @@ of your file and create signatures based on those hashes. Returns a crc32 checksum for the given string. +.. c:function:: tlsh_diff(tlsh) + + Computes the TLSH hash for the whole file (the offset is set to zero and the size is set to the + size of the file). The returned integer is the difference between the computed TLSH hash and the *tlsh* hash string. + + *Example: hash.tlsh_diff("T1A4315014DC89DDDDFB6246C177B3B52BA818B01142CCF89682EACC07D800F79C64BB52") < 50* + +.. c:function:: tlsh_diff(tlsh, offset, size) + + Computes the TLSH hash for the *size* bytes starting at *offset*. When scanning a running process + the *offset* argument should be a virtual address within the process address space. The returned + integer is the difference between the computed TLSH hash and the *tlsh* hash string. + + *Example: hash.tlsh_diff("A4315014DC89DDDDFB6246C177B3B52BA818B01142CCF89682EACC07D800F79C64BB52", 0, filesize) == 0* + +.. c:function:: tlsh_diff(tlsh, string) + + Computes the TLSH hash of the given *string*. The returned integer is the difference between the computed TLSH hash and the *tlsh* hash string. 
+ diff --git a/libyara/include/tlshc/tlsh.h b/libyara/include/tlshc/tlsh.h index 6d8be31ede..ebe9682e87 100644 --- a/libyara/include/tlshc/tlsh.h +++ b/libyara/include/tlshc/tlsh.h @@ -55,10 +55,12 @@ void tlsh_free(Tlsh* tlsh); void tlsh_reset(Tlsh* tlsh); int tlsh_update(Tlsh* tlsh, const unsigned char* data, unsigned int len); int tlsh_final(Tlsh* tlsh, const unsigned char* data, unsigned int len, int tlsh_option); +int tlsh_total_diff(Tlsh* tlsh, Tlsh* other, bool len_diff); +int tlsh_from_tlsh_str(Tlsh* tlsh, const char* str); const char* tlsh_get_hash(Tlsh* tlsh, bool showvers); #ifdef __cplusplus } #endif -#endif // __TLSH_TLSH_H__ \ No newline at end of file +#endif // __TLSH_TLSH_H__ diff --git a/libyara/modules/hash/hash.c b/libyara/modules/hash/hash.c index 51f500768a..1e0c2232d0 100644 --- a/libyara/modules/hash/hash.c +++ b/libyara/modules/hash/hash.c @@ -27,6 +27,7 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include #include #include #include @@ -162,6 +163,174 @@ static int add_to_cache( return result; } +static Tlsh* get_tlsh_pivot(SIZED_STRING* ss_tlsh) // builds a Tlsh from the rule's hash string; NULL on bad length, allocation failure or parse failure +{ + // validation of input TLSH string + if (ss_tlsh->length != TLSH_STRING_LEN_REQ && ss_tlsh->length != TLSH_STRING_LEN_REQ-2) + return NULL; + + Tlsh* tlsh = tlsh_new(); + if (tlsh == NULL) + { + // allocation failed, so there is no object to release + + return NULL; + } + + if (tlsh_from_tlsh_str(tlsh, ss_string(ss_tlsh)) != 0) + { + tlsh_free(tlsh); + + return NULL; + } + + return tlsh; +} + +static bool validate_tlsh_data(const int tlsh_option, const int64_t data_len) // true when data_len reaches the minimum length for the selected option +{ + // validation of input data + if ((tlsh_option & TLSH_OPTION_CONSERVATIVE) == 0) + { + if (data_len < MIN_DATA_LENGTH) + return false; + } + else + { + if (data_len < MIN_CONSERVATIVE_DATA_LENGTH) + return false; + } + + return true; +} + +static const char* get_tlsh_ascii(Tlsh* tlsh, const int showvers) // returns the digest string, or NULL when it is NULL or empty +{ + const char* digest_ascii = tlsh_get_hash(tlsh, showvers); + if (digest_ascii && !digest_ascii[0]) + return NULL; + + return digest_ascii; +} + +static bool check_mem_block(const YR_MEMORY_BLOCK* block, const int64_t offset, const int64_t length, const char* func_name) // false when block is NULL, offset/length is negative or offset precedes block->base +{ + if (block == NULL) + { + YR_DEBUG_FPRINTF( + 2, stderr, "} // %s() = YR_UNDEFINED // block == NULL\n", func_name); + + return false; + } + + if (offset < 0 || length < 0 || (uint64_t) offset < block->base) // offset/length are signed so that negative rule arguments are actually rejected + { + YR_DEBUG_FPRINTF( + 2, + stderr, + "} // %s() = YR_UNDEFINED // bad offset / length\n", + func_name); + + return false; + } + + return true; +} + +static bool get_tlsh_digest( + Tlsh* tlsh, const int tlsh_option, + YR_MEMORY_BLOCK* block, YR_MEMORY_BLOCK_ITERATOR* iterator, + int64_t offset, int64_t length, + YR_OBJECT* module, const char* func_name) +{ + char* cached_ascii_digest = get_from_cache( + module, "tlsh_diff", offset, length); + + if (cached_ascii_digest != NULL) + { + YR_DEBUG_FPRINTF( + 2, + stderr, + "} // %s() = %s (cached)\n", + func_name, + cached_ascii_digest); + + if 
(tlsh_from_tlsh_str(tlsh, cached_ascii_digest) != 0) + { + + return false; + } + } + else + { + int past_first_block = false; + + foreach_memory_block(iterator, block) + { + // if desired block within current block + if (offset >= block->base && offset < block->base + block->size) + { + const uint8_t* block_data = block->fetch_data(block); + + if (block_data != NULL) + { + size_t data_offset = (size_t) (offset - block->base); + size_t data_len = (size_t) yr_min(length, block->size - data_offset); + + offset += data_len; + length -= data_len; + + if (tlsh_update(tlsh, block_data + data_offset, data_len) != 0) + { + + return false; + } + } + + past_first_block = true; + } + else if (past_first_block) + { + // If offset is not within current block and we are already + // past the first block then we are trying to compute + // the checksum over a range of non contiguous blocks. As + // range contains gaps of undefined data the checksum is + // undefined. + + YR_DEBUG_FPRINTF( + 2, + stderr, + "} // %s() = YR_UNDEFINED // past_first_block\n", + func_name); + + tlsh_final(tlsh, NULL, 0, tlsh_option); + + return false; + } + + if (block->base + block->size >= offset + length) // stop once the requested range ends inside or exactly at the end of this block + break; + } + + if (!past_first_block) + { + YR_DEBUG_FPRINTF( + 2, + stderr, + "} // %s() = YR_UNDEFINED // !past_first_block\n", + func_name); + + return false; + } + + // Finalize the digest exactly once, and only when some block + // actually contributed data to it. + tlsh_final(tlsh, NULL, 0, tlsh_option); + } + + return true; +} + define_function(string_md5) { unsigned char digest[YR_MD5_LEN]; @@ -808,6 +977,224 @@ define_function(data_crc32) return_integer(checksum ^ 0xFFFFFFFF); } +define_function(file_tlsh_diff) +{ + int tlsh_option = 0; // default option + int showvers = 1; // default is to show hash string with the version byte + bool len_diff = true; // default is to include length of the file into diff calculation + + SIZED_STRING* s = sized_string_argument(1); // TLSH hash string in hexadecimal format to compute the diff against + + Tlsh* tlsh_pivot 
= get_tlsh_pivot(s); + if (tlsh_pivot == NULL) + return_integer(YR_UNDEFINED); + + Tlsh* tlsh = tlsh_new(); + FAIL_ON_NULL_WITH_CLEANUP( + tlsh, + tlsh_free(tlsh_pivot)); + + YR_SCAN_CONTEXT* context = yr_scan_context(); + YR_MEMORY_BLOCK* block = first_memory_block(context); + YR_MEMORY_BLOCK_ITERATOR* iterator = context->iterator; + int64_t offset = 0; + int64_t length = (block != NULL) ? (int64_t) block->size : 0; // a NULL block must not be dereferenced; it is rejected by the checks below + + YR_DEBUG_FPRINTF( + 2, + stderr, + "+ %s(offset=%" PRIi64 " length=%" PRIi64 " hash=%s) {\n", + __FUNCTION__, + offset, + length, + ss_string(s)); + + if (!validate_tlsh_data(tlsh_option, length)) + { + tlsh_free(tlsh_pivot); + tlsh_free(tlsh); + + return_integer(YR_UNDEFINED); + } + + if (!check_mem_block(block, offset, length, __FUNCTION__)) + { + tlsh_free(tlsh_pivot); + tlsh_free(tlsh); + + return_integer(YR_UNDEFINED); + } + + if (!get_tlsh_digest(tlsh, tlsh_option, block, iterator, offset, length, yr_module(), __FUNCTION__)) + { + tlsh_free(tlsh_pivot); + tlsh_free(tlsh); + + return_integer(YR_UNDEFINED); + } + + const char* digest_ascii = get_tlsh_ascii(tlsh, showvers); + FAIL_ON_NULL_WITH_CLEANUP( + digest_ascii, + tlsh_free(tlsh_pivot); tlsh_free(tlsh)); + + FAIL_ON_ERROR_WITH_CLEANUP( + add_to_cache(yr_module(), "tlsh_diff", offset, length, digest_ascii), tlsh_free(tlsh_pivot); tlsh_free(tlsh)); + + YR_DEBUG_FPRINTF(2, stderr, "} // %s() = 0x%s\n", __FUNCTION__, digest_ascii); + + int diff = tlsh_total_diff(tlsh_pivot, tlsh, len_diff); + if (diff < 0) + { + tlsh_free(tlsh_pivot); + tlsh_free(tlsh); + + return_integer(YR_UNDEFINED); + } + + tlsh_free(tlsh_pivot); + tlsh_free(tlsh); + + return_integer(diff); +} + +define_function(data_tlsh_diff) +{ + int tlsh_option = 0; // default option + int showvers = 1; // default is to show hash string with the version byte + bool len_diff = true; // default is to include length of the file into diff calculation + + SIZED_STRING* s = sized_string_argument(1); // TLSH hash string in hexadecimal format to compute the diff against + int64_t arg_offset = integer_argument(2); // 
offset where to start + int64_t arg_length = integer_argument(3); // number of bytes to hash + + int64_t offset = arg_offset; + int64_t length = arg_length; + + Tlsh* tlsh_pivot = get_tlsh_pivot(s); + if (tlsh_pivot == NULL) + return_integer(YR_UNDEFINED); + + Tlsh* tlsh = tlsh_new(); + FAIL_ON_NULL_WITH_CLEANUP( + tlsh, + tlsh_free(tlsh_pivot)); + + YR_SCAN_CONTEXT* context = yr_scan_context(); + YR_MEMORY_BLOCK* block = first_memory_block(context); + YR_MEMORY_BLOCK_ITERATOR* iterator = context->iterator; + + YR_DEBUG_FPRINTF( + 2, + stderr, + "+ %s(offset=%" PRIi64 " length=%" PRIi64 " hash=%s) {\n", + __FUNCTION__, + offset, + length, + ss_string(s)); + + if (!validate_tlsh_data(tlsh_option, length)) + { + tlsh_free(tlsh_pivot); + tlsh_free(tlsh); + + return_integer(YR_UNDEFINED); + } + + if (!check_mem_block(block, offset, length, __FUNCTION__)) + { + tlsh_free(tlsh_pivot); + tlsh_free(tlsh); + + return_integer(YR_UNDEFINED); + } + + if (!get_tlsh_digest(tlsh, tlsh_option, block, iterator, offset, length, yr_module(), __FUNCTION__)) + { + tlsh_free(tlsh_pivot); + tlsh_free(tlsh); + + return_integer(YR_UNDEFINED); + } + + const char* digest_ascii = get_tlsh_ascii(tlsh, showvers); + FAIL_ON_NULL_WITH_CLEANUP( + digest_ascii, + tlsh_free(tlsh_pivot); tlsh_free(tlsh)); + + FAIL_ON_ERROR_WITH_CLEANUP( + add_to_cache(yr_module(), "tlsh_diff", arg_offset, arg_length, digest_ascii), tlsh_free(tlsh_pivot); tlsh_free(tlsh)); + + YR_DEBUG_FPRINTF(2, stderr, "} // %s() = 0x%s\n", __FUNCTION__, digest_ascii); + + int diff = tlsh_total_diff(tlsh_pivot, tlsh, len_diff); + if (diff < 0) + { + tlsh_free(tlsh_pivot); + tlsh_free(tlsh); + + return_integer(YR_UNDEFINED); + } + + tlsh_free(tlsh_pivot); + tlsh_free(tlsh); + + return_integer(diff); +} + +define_function(string_tlsh_diff) +{ + int tlsh_option = 0; // default option + int showvers = 1; // default is to show hash string with the version byte + bool len_diff = true; // default is to include the data length into diff calculation + + SIZED_STRING* 
s1 = sized_string_argument(1); // TLSH hash string in hexadecimal format to compute the diff against + SIZED_STRING* s2 = sized_string_argument(2); // content string whose TLSH is computed and compared + + if (!validate_tlsh_data(tlsh_option, s2->length)) + return_integer(YR_UNDEFINED); + + Tlsh* tlsh_pivot = get_tlsh_pivot(s1); + if (tlsh_pivot == NULL) + return_integer(YR_UNDEFINED); + + Tlsh* tlsh = tlsh_new(); + FAIL_ON_NULL_WITH_CLEANUP( + tlsh, + tlsh_free(tlsh_pivot)); + + FAIL_ON_ERROR_WITH_CLEANUP( + tlsh_final(tlsh, (const unsigned char*) ss_string(s2), s2->length, tlsh_option), + tlsh_free(tlsh_pivot); tlsh_free(tlsh)); + + const char* digest_ascii = get_tlsh_ascii(tlsh, showvers); + FAIL_ON_NULL_WITH_CLEANUP( + digest_ascii, + tlsh_free(tlsh_pivot); tlsh_free(tlsh)); + + YR_DEBUG_FPRINTF( + 2, + stderr, + "- %s() {} = 0x%s // s2->length=%u\n", + __FUNCTION__, + digest_ascii, + s2->length); + + int diff = tlsh_total_diff(tlsh_pivot, tlsh, len_diff); + if (diff < 0) + { + tlsh_free(tlsh_pivot); + tlsh_free(tlsh); + + return_integer(YR_UNDEFINED); + } + + tlsh_free(tlsh_pivot); + tlsh_free(tlsh); + + return_integer(diff); +} + begin_declarations declare_function("md5", "ii", "s", data_md5); declare_function("md5", "s", "s", string_md5); @@ -823,6 +1210,10 @@ begin_declarations declare_function("crc32", "ii", "i", data_crc32); declare_function("crc32", "s", "i", string_crc32); + + declare_function("tlsh_diff", "s", "i", file_tlsh_diff); + declare_function("tlsh_diff", "sii", "i", data_tlsh_diff); + declare_function("tlsh_diff", "ss", "i", string_tlsh_diff); end_declarations int module_initialize(YR_MODULE* module) diff --git a/libyara/tlshc/tlsh.c b/libyara/tlshc/tlsh.c index 534a35b7b0..c3a7476c29 100644 --- a/libyara/tlshc/tlsh.c +++ b/libyara/tlshc/tlsh.c @@ -1,3 +1,4 @@ +#include #include #include #include "tlsh_impl.h" @@ -71,10 +72,31 @@ int tlsh_final( return 0; } + +int tlsh_total_diff(Tlsh* tlsh, Tlsh* other, bool len_diff) +{ + if 
(!tlsh || !tlsh->impl || !other || !other->impl) // reject NULL handles on both sides before touching ->impl + return -(EINVAL); + else if (tlsh == other || tlsh_impl_compare(tlsh->impl, other->impl) == 0) + return 0; + else + return tlsh_impl_total_diff(tlsh->impl, other->impl, len_diff); +} + +int tlsh_from_tlsh_str(Tlsh* tlsh, const char* str) +{ + if (!tlsh->impl) + return -(ENOMEM); + else if (!str) + return -(EINVAL); + else + return tlsh_impl_from_tlsh_str(tlsh->impl, str); +} + const char* tlsh_get_hash(Tlsh* tlsh, bool showvers) { if (tlsh->impl) return tlsh_impl_hash(tlsh->impl, showvers); else return ""; -} \ No newline at end of file +} diff --git a/tests/test-rules.c b/tests/test-rules.c index bded4792b7..7ad4496a74 100644 --- a/tests/test-rules.c +++ b/tests/test-rules.c @@ -3315,6 +3315,30 @@ static void test_hash_module() }", blob); + uint8_t blob_tlsh[] = { + 0x54, 0x68, 0x69, 0x73, 0x20, 0x73, 0x74, 0x72, 0x69 ,0x6e, + 0x67, 0x20, 0x69, 0x73, 0x20, 0x6c, 0x6f, 0x6f, 0x6f ,0x6f, + 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f ,0x6e, + 0x67, 0x65, 0x72, 0x20, 0x74, 0x68, 0x61, 0x6e, 0x20 ,0x66, + 0x69, 0x66, 0x74, 0x79, 0x20, 0x62, 0x79, 0x74, 0x65 ,0x73, 0x2e}; // 51 bytes string without trailing zero + + assert_true_rule_blob( + "import \"hash\" \ + rule test { \ + condition: \ + hash.tlsh_diff( \ + \"T1B79004053DF4C050473C01735755410FF75CC0D3171151FC44413010745113D01743D1\") == 0 \ + and \ + hash.tlsh_diff( \ + \"T1B79004053DF4C050473C01735755410FF75CC0D3171151FC44413010745113D01743D1\", \ + 0, filesize) == 0 \ + and \ + hash.tlsh_diff( \ + \"T1B79004053DF4C050473C01735755410FF75CC0D3171151FC44413010745113D01743D1\", \ + 1, filesize) != 0 \ + }", + blob_tlsh); + assert_true_rule( "import \"hash\" \ rule test { \ @@ -3331,6 +3355,10 @@ static void test_hash_module() hash.crc32(\"TEST STRING\") == 0x51f9be31 \ and \ hash.checksum32(\"TEST STRING\") == 0x337 \ + and \ + hash.tlsh_diff( \ + \"T1B79004053DF4C050473C01735755410FF75CC0D3171151FC44413010745113D01743D1\", \ + \"This string is looooooooooooonger than 
fifty bytes.\") == 0 \ }", NULL); @@ -3354,6 +3382,27 @@ static void test_hash_module() }", blob); + assert_true_rule_blob( + "import \"hash\" \ + rule test { \ + condition: \ + hash.tlsh_diff( \ + \"T1B79004053DF4C050473C01735755410FF75CC0D3171151FC44413010745113D01743D1\", \ + 0, filesize) == 0 \ + and \ + hash.tlsh_diff( \ + \"T1B79004053DF4C050473C01735755410FF75CC0D3171151FC44413010745113D01743D1\", \ + 1, filesize) != 0 \ + and \ + hash.tlsh_diff( \ + \"B79004053DF4C050473C01735755410FF75CC0D3171151FC44413010745113D01743D1\") == 0 \ + and \ + hash.tlsh_diff( \ + \"B79004053DF4C050473C01735755410FF75CC0D3171151FC44413010745113D01743D1\", \ + 1, filesize) != 0 \ + }", + blob_tlsh); + uint8_t multi_block_blob[] = TEXT_1024_BYTES TEXT_1024_BYTES; assert_true_rule_blob( @@ -3399,6 +3448,14 @@ static void test_hash_module() hash.crc32(0, filesize) == 0x2b11af72 \ and \ hash.crc32(\"TEST STRING\") == 0x51f9be31 \ + and \ + hash.tlsh_diff( \ + \"3741038C22D20C6FEE451103DF0C22DBC343C404F8A2880F10C22060300DE0357238F7\", \ + 0, filesize) == 0 \ + and \ + hash.tlsh_diff( \ + \"3741038C22D20C6FEE451103DF0C22DBC343C404F8A2880F10C22060300DE0357238F7\", \ + 1, filesize) != 0 \ }", multi_block_blob);