diff --git a/docs/modules/hash.rst b/docs/modules/hash.rst index 8829ed2cc8..5e62e23a2d 100644 --- a/docs/modules/hash.rst +++ b/docs/modules/hash.rst @@ -10,6 +10,14 @@ Hash module The Hash module allows you to calculate hashes (MD5, SHA1, SHA256) from portions of your file and create signatures based on those hashes. +It also allows you to work with Locality Sensitive Hashes from Trend Micro (TLSH). +Specifically, you can compute the distance between the TLSH of a portion of +your file (min. 50 bytes) and an input TLSH string. Distance scores can go up to +1000 and even above. A low score (of 50 or less) means that files are quite similar, +while a distance of zero means (very likely) an exact match. Just like with the MD5 and +SHA1 schemes, collisions can occur, so very different files may have the same hash +value. + .. important:: This module depends on the OpenSSL library. Please refer to :ref:`compiling-yara` for information about how to build OpenSSL-dependant @@ -24,6 +32,14 @@ of your file and create signatures based on those hashes. requires the hash string to be given in lowercase, otherwise the match condition will not work. (see https://github.com/VirusTotal/yara/issues/1004) + A TLSH is not valid in lowercase. Therefore, the input hash must be given in uppercase, unlike + the hashes of the traditional hash functions. The module accepts a TLSH either with or without the leading + "T1" prefix specifying the version of TLSH. + + DISCLAIMER: Computing a TLSH is very slow, comparable with SSDEEP hashing, which means approx. + 5.4 times slower than the SHA1 function. Adding the `tlsh_diff` function to a YARA rule can extend + its evaluation time by up to 15%. Be especially careful when scanning files bigger than 5 MB. + .. c:function:: md5(offset, size) Returns the MD5 hash for *size* bytes starting at *offset*. When scanning a @@ -78,3 +94,22 @@ of your file and create signatures based on those hashes. Returns a crc32 checksum for the given string. +.. 
c:function:: tlsh_diff(tlsh) + Computes the TLSH hash for the whole file (the offset is set to zero and + size is set to size of the file). The returned integer is the difference + between computed TLSH hash and *tlsh* hash string. + + *Example: hash.tlsh_diff("T1A4315014DC89DDDDFB6246C177B3B52BA818B01142CCF89682EACC07D800F79C64BB52") < 50* + +.. c:function:: tlsh_diff(tlsh, offset, size) + Computes the TLSH hash for the *size* bytes starting at *offset*. When + scanning a running process the *offset* argument should be a virtual address + within the process address space. The returned integer is the difference + between computed TLSH hash and *tlsh* hash string. + + *Example: hash.tlsh_diff("A4315014DC89DDDDFB6246C177B3B52BA818B01142CCF89682EACC07D800F79C64BB52", 0, filesize) == 0* + +.. c:function:: tlsh_diff(tlsh, string) + Computes the TLSH hash for the *string* of content. The returned integer + is the difference between computed TLSH hash and *tlsh* hash string. + diff --git a/libyara/include/tlshc/tlsh.h b/libyara/include/tlshc/tlsh.h index 6d8be31ede..ebe9682e87 100644 --- a/libyara/include/tlshc/tlsh.h +++ b/libyara/include/tlshc/tlsh.h @@ -55,10 +55,12 @@ void tlsh_free(Tlsh* tlsh); void tlsh_reset(Tlsh* tlsh); int tlsh_update(Tlsh* tlsh, const unsigned char* data, unsigned int len); int tlsh_final(Tlsh* tlsh, const unsigned char* data, unsigned int len, int tlsh_option); +int tlsh_total_diff(Tlsh* tlsh, Tlsh* other, bool len_diff); +int tlsh_from_tlsh_str(Tlsh* tlsh, const char* str); const char* tlsh_get_hash(Tlsh* tlsh, bool showvers); #ifdef __cplusplus } #endif -#endif // __TLSH_TLSH_H__ \ No newline at end of file +#endif // __TLSH_TLSH_H__ diff --git a/libyara/modules/hash/hash.c b/libyara/modules/hash/hash.c index 51f500768a..1e0c2232d0 100644 --- a/libyara/modules/hash/hash.c +++ b/libyara/modules/hash/hash.c @@ -27,6 +27,7 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 
SUCH DAMAGE.
 */
 
+#include
 #include
 #include
 #include
@@ -162,6 +163,183 @@ static int add_to_cache(
   return result;
 }
 
+// Parses the TLSH string given to tlsh_diff() into a newly allocated Tlsh.
+//
+// Only strings of TLSH_STRING_LEN_REQ characters, or two characters less,
+// are accepted. Returns NULL when the length is wrong, when tlsh_new() fails
+// or when tlsh_from_tlsh_str() rejects the string. On success the caller
+// owns the object and must release it with tlsh_free().
+static Tlsh* get_tlsh_pivot(SIZED_STRING* ss_tlsh)
+{
+  if (ss_tlsh->length != TLSH_STRING_LEN_REQ &&
+      ss_tlsh->length != TLSH_STRING_LEN_REQ - 2)
+    return NULL;
+
+  Tlsh* tlsh = tlsh_new();
+
+  // Nothing has been allocated on failure, so there is nothing to free.
+  if (tlsh == NULL)
+    return NULL;
+
+  if (tlsh_from_tlsh_str(tlsh, ss_string(ss_tlsh)) != 0)
+  {
+    tlsh_free(tlsh);
+    return NULL;
+  }
+
+  return tlsh;
+}
+
+// Tells whether data_len bytes are enough to compute a TLSH with the given
+// options: at least MIN_CONSERVATIVE_DATA_LENGTH bytes when
+// TLSH_OPTION_CONSERVATIVE is set, at least MIN_DATA_LENGTH bytes otherwise.
+static bool validate_tlsh_data(const int tlsh_option, const int64_t data_len)
+{
+  if ((tlsh_option & TLSH_OPTION_CONSERVATIVE) != 0)
+    return data_len >= MIN_CONSERVATIVE_DATA_LENGTH;
+
+  return data_len >= MIN_DATA_LENGTH;
+}
+
+// Returns the hash of tlsh as a string, or NULL when tlsh_get_hash() returns
+// NULL or an empty string.
+static const char* get_tlsh_ascii(Tlsh* tlsh, const int showvers)
+{
+  const char* digest_ascii = tlsh_get_hash(tlsh, showvers);
+
+  if (digest_ascii != NULL && digest_ascii[0] == '\0')
+    return NULL;
+
+  return digest_ascii;
+}
+
+// Sanity checks the requested range against a memory block.
+//
+// offset and length are signed on purpose: they come from rule arguments, so
+// negative values are possible and have to be rejected here. Returns false
+// when block is NULL, when offset or length is negative, or when offset lies
+// before the base of the block.
+static bool check_mem_block(
+    const YR_MEMORY_BLOCK* block,
+    const int64_t offset,
+    const int64_t length,
+    const char* func_name)
+{
+  if (block == NULL)
+  {
+    YR_DEBUG_FPRINTF(
+        2, stderr, "} // %s() = YR_UNDEFINED // block == NULL\n", func_name);
+
+    return false;
+  }
+
+  if (offset < 0 || length < 0 || (uint64_t) offset < block->base)
+  {
+    YR_DEBUG_FPRINTF(
+        2,
+        stderr,
+        "} // %s() = YR_UNDEFINED // bad offset / length\n",
+        func_name);
+
+    return false;
+  }
+
+  return true;
+}
+
+// Updates tlsh with the bytes in [offset, offset + length) and finalizes it.
+//
+// When the module cache already holds a "tlsh_diff" digest for this range,
+// tlsh is loaded from that string and the data is not hashed again. Returns
+// false when the cached string can not be parsed, when tlsh_update() fails,
+// when no block contains offset, or when the range continues into a block
+// that does not contain the next offset. tlsh stays owned by the caller in
+// every case.
+static bool get_tlsh_digest(
+    Tlsh* tlsh,
+    const int tlsh_option,
+    YR_MEMORY_BLOCK* block,
+    YR_MEMORY_BLOCK_ITERATOR* iterator,
+    int64_t offset,
+    int64_t length,
+    YR_OBJECT* module,
+    const char* func_name)
+{
+  char* cached_ascii_digest = get_from_cache(
+      module, "tlsh_diff", offset, length);
+
+  if (cached_ascii_digest != NULL)
+  {
+    YR_DEBUG_FPRINTF(
+        2,
+        stderr,
+        "} // %s() = %s (cached)\n",
+        func_name,
+        cached_ascii_digest);
+
+    return tlsh_from_tlsh_str(tlsh, cached_ascii_digest) == 0;
+  }
+
+  bool past_first_block = false;
+
+  foreach_memory_block(iterator, block)
+  {
+    // Does the current block contain offset?
+    if (offset >= block->base && offset < block->base + block->size)
+    {
+      const uint8_t* block_data = block->fetch_data(block);
+
+      if (block_data != NULL)
+      {
+        size_t data_offset = (size_t) (offset - block->base);
+        size_t data_len = (size_t) yr_min(length, block->size - data_offset);
+
+        offset += data_len;
+        length -= data_len;
+
+        if (tlsh_update(tlsh, block_data + data_offset, data_len) != 0)
+          return false;
+      }
+
+      past_first_block = true;
+    }
+    else if (past_first_block)
+    {
+      // offset is outside the current block although hashing has already
+      // started, so the requested range spans non contiguous blocks. The
+      // gaps contain undefined data, which makes the digest undefined.
+
+      YR_DEBUG_FPRINTF(
+          2,
+          stderr,
+          "} // %s() = YR_UNDEFINED // past_first_block\n",
+          func_name);
+
+      return false;
+    }
+
+    if (block->base + block->size > offset + length)
+      break;
+  }
+
+  if (!past_first_block)
+  {
+    YR_DEBUG_FPRINTF(
+        2,
+        stderr,
+        "} // %s() = YR_UNDEFINED // !past_first_block\n",
+        func_name);
+
+    return false;
+  }
+
+  // Finalize exactly once, and only after the range has been walked; the
+  // failure paths above return before tlsh would ever be read.
+  tlsh_final(tlsh, NULL, 0, tlsh_option);
+
+  return true;
+}
+
 define_function(string_md5)
 {
   unsigned char digest[YR_MD5_LEN];
@@ -808,6 +986,193 @@ define_function(data_crc32)
   return_integer(checksum ^ 0xFFFFFFFF);
 }
 
+// Shared implementation of tlsh_diff(tlsh) and tlsh_diff(tlsh, offset, size).
+//
+// Hashes length bytes starting at offset and stores in *diff the distance
+// between that digest and the hash given in pivot_str. *diff is YR_UNDEFINED
+// when check_mem_block() or validate_tlsh_data() rejects the range, when
+// get_tlsh_pivot() rejects pivot_str, when get_tlsh_digest() fails or when
+// tlsh_total_diff() reports an error.
+//
+// Returns whatever the FAIL_ON_* macros propagate (failed tlsh_new(), missing
+// digest string, failed add_to_cache()) and ERROR_SUCCESS otherwise. Both
+// Tlsh objects are released on every path.
+static int tlsh_diff_for_range(
+    YR_MEMORY_BLOCK* block,
+    YR_MEMORY_BLOCK_ITERATOR* iterator,
+    YR_OBJECT* module,
+    SIZED_STRING* pivot_str,
+    const int64_t offset,
+    const int64_t length,
+    const char* func_name,
+    int64_t* diff)
+{
+  const int tlsh_option = 0;   // default option
+  const int showvers = 1;      // hash string keeps the version byte
+  const bool len_diff = true;  // data length takes part in the distance
+
+  *diff = YR_UNDEFINED;
+
+  YR_DEBUG_FPRINTF(
+      2,
+      stderr,
+      "+ %s(offset=%" PRIi64 " length=%" PRIi64 " hash=%s) {\n",
+      func_name,
+      offset,
+      length,
+      ss_string(pivot_str));
+
+  // Cheap checks first, so that nothing is allocated for ranges that can not
+  // be hashed anyway.
+  if (!check_mem_block(block, offset, length, func_name) ||
+      !validate_tlsh_data(tlsh_option, length))
+    return ERROR_SUCCESS;
+
+  Tlsh* tlsh_pivot = get_tlsh_pivot(pivot_str);
+
+  if (tlsh_pivot == NULL)
+    return ERROR_SUCCESS;
+
+  Tlsh* tlsh = tlsh_new();
+
+  FAIL_ON_NULL_WITH_CLEANUP(tlsh, tlsh_free(tlsh_pivot));
+
+  const bool hashed = get_tlsh_digest(
+      tlsh, tlsh_option, block, iterator, offset, length, module, func_name);
+
+  if (hashed)
+  {
+    const char* digest_ascii = get_tlsh_ascii(tlsh, showvers);
+
+    FAIL_ON_NULL_WITH_CLEANUP(
+        digest_ascii,
+        tlsh_free(tlsh_pivot); tlsh_free(tlsh));
+
+    // The *_WITH_CLEANUP variant keeps both objects from leaking when the
+    // cache insertion fails.
+    FAIL_ON_ERROR_WITH_CLEANUP(
+        add_to_cache(module, "tlsh_diff", offset, length, digest_ascii),
+        tlsh_free(tlsh_pivot); tlsh_free(tlsh));
+
+    YR_DEBUG_FPRINTF(2, stderr, "} // %s() = 0x%s\n", func_name, digest_ascii);
+
+    const int distance = tlsh_total_diff(tlsh_pivot, tlsh, len_diff);
+
+    // Negative values are error codes, not distances.
+    if (distance >= 0)
+      *diff = distance;
+  }
+
+  tlsh_free(tlsh_pivot);
+  tlsh_free(tlsh);
+
+  return ERROR_SUCCESS;
+}
+
+// tlsh_diff(tlsh): distance between tlsh and the TLSH of the first memory
+// block, hashed from its first byte for block->size bytes.
+define_function(file_tlsh_diff)
+{
+  YR_SCAN_CONTEXT* context = yr_scan_context();
+  YR_MEMORY_BLOCK* block = first_memory_block(context);
+  int64_t diff = YR_UNDEFINED;
+
+  // A missing block is rejected by tlsh_diff_for_range(); just make sure its
+  // size is not read here.
+  const int64_t length = (block != NULL) ? (int64_t) block->size : 0;
+
+  FAIL_ON_ERROR(tlsh_diff_for_range(
+      block,
+      context->iterator,
+      yr_module(),
+      sized_string_argument(1),  // TLSH hash string to compare against
+      0,
+      length,
+      __FUNCTION__,
+      &diff));
+
+  return_integer(diff);
+}
+
+// tlsh_diff(tlsh, offset, size): distance between tlsh and the TLSH of size
+// bytes starting at offset.
+define_function(data_tlsh_diff)
+{
+  YR_SCAN_CONTEXT* context = yr_scan_context();
+  int64_t diff = YR_UNDEFINED;
+
+  FAIL_ON_ERROR(tlsh_diff_for_range(
+      first_memory_block(context),
+      context->iterator,
+      yr_module(),
+      sized_string_argument(1),  // TLSH hash string to compare against
+      integer_argument(2),       // offset where to start
+      integer_argument(3),       // number of bytes to hash
+      __FUNCTION__,
+      &diff));
+
+  return_integer(diff);
+}
+
+// tlsh_diff(tlsh, string): distance between tlsh and the TLSH of string.
+define_function(string_tlsh_diff)
+{
+  const int tlsh_option = 0;   // default option
+  const int showvers = 1;      // hash string keeps the version byte
+  const bool len_diff = true;  // data length takes part in the distance
+
+  SIZED_STRING* s1 = sized_string_argument(1);  // TLSH hash to compare against
+  SIZED_STRING* s2 = sized_string_argument(2);  // content to be hashed
+
+  int64_t diff = YR_UNDEFINED;
+
+  if (!validate_tlsh_data(tlsh_option, s2->length))
+    return_integer(YR_UNDEFINED);
+
+  Tlsh* tlsh_pivot = get_tlsh_pivot(s1);
+
+  if (tlsh_pivot == NULL)
+    return_integer(YR_UNDEFINED);
+
+  Tlsh* tlsh = tlsh_new();
+
+  FAIL_ON_NULL_WITH_CLEANUP(tlsh, tlsh_free(tlsh_pivot));
+
+  // NOTE(review): the TLSH wrappers in tlsh.c report failures as negated
+  // errno values, which are not YARA error codes, so a failing tlsh_final()
+  // yields an undefined result instead of being returned to the scanner.
+  const int final_rc = tlsh_final(
+      tlsh, (const unsigned char*) ss_string(s2), s2->length, tlsh_option);
+
+  if (final_rc == 0)
+  {
+    const char* digest_ascii = get_tlsh_ascii(tlsh, showvers);
+
+    FAIL_ON_NULL_WITH_CLEANUP(
+        digest_ascii,
+        tlsh_free(tlsh_pivot); tlsh_free(tlsh));
+
+    YR_DEBUG_FPRINTF(
+        2,
+        stderr,
+        "- %s() {} = 0x%s // s2->length=%u\n",
+        __FUNCTION__,
+        digest_ascii,
+        s2->length);
+
+    const int distance = tlsh_total_diff(tlsh_pivot, tlsh, len_diff);
+
+    // Negative values are error codes, not distances.
+    if (distance >= 0)
+      diff = distance;
+  }
+
+  tlsh_free(tlsh_pivot);
+  tlsh_free(tlsh);
+
+  return_integer(diff);
+}
+
 begin_declarations
   declare_function("md5", "ii", "s", data_md5);
   declare_function("md5", "s", "s", string_md5);
@@ -823,6 +1188,10 @@ begin_declarations
 
   declare_function("crc32", "ii", "i", data_crc32);
   declare_function("crc32", "s", "i", string_crc32);
+
+  declare_function("tlsh_diff", "s", "i", file_tlsh_diff);
+  declare_function("tlsh_diff", "sii", "i", data_tlsh_diff);
+  declare_function("tlsh_diff", "ss", "i", string_tlsh_diff);
 end_declarations
 
 int module_initialize(YR_MODULE* module)
diff --git a/libyara/tlshc/tlsh.c b/libyara/tlshc/tlsh.c
index 534a35b7b0..c3a7476c29 100644
--- a/libyara/tlshc/tlsh.c
+++ b/libyara/tlshc/tlsh.c
@@ -1,3 +1,4 @@
+#include
 #include
 #include
 #include "tlsh_impl.h"
@@ -71,10 +72,37 @@ int tlsh_final(
 
     return 0;
 }
+
+// Returns 0 when tlsh and other are the same object or compare equal, the
+// result of tlsh_impl_total_diff() otherwise, and -(EINVAL) when either
+// argument is NULL or has no implementation attached.
+int tlsh_total_diff(Tlsh* tlsh, Tlsh* other, bool len_diff)
+{
+    if (!tlsh || !tlsh->impl || !other || !other->impl)
+        return -(EINVAL);
+    else if (tlsh == other || tlsh_impl_compare(tlsh->impl, other->impl) == 0)
+        return 0;
+    else
+        return tlsh_impl_total_diff(tlsh->impl, other->impl, len_diff);
+}
+
+// Loads tlsh from the hash string str. Returns -(ENOMEM) when tlsh is NULL or
+// has no implementation attached, -(EINVAL) when str is NULL, and the result
+// of tlsh_impl_from_tlsh_str() otherwise.
+int tlsh_from_tlsh_str(Tlsh* tlsh, const char* str)
+{
+    if (!tlsh || !tlsh->impl)
+        return -(ENOMEM);
+    else if (!str)
+        return -(EINVAL);
+    else
+        return tlsh_impl_from_tlsh_str(tlsh->impl, str);
+}
+
 const char* tlsh_get_hash(Tlsh* tlsh, bool showvers)
 {
     if (tlsh->impl)
         return tlsh_impl_hash(tlsh->impl, showvers);
     else
         return "";
-}
\ No newline at end of file
+}
diff --git a/tests/test-rules.c b/tests/test-rules.c
index bded4792b7..7ad4496a74 100644
--- a/tests/test-rules.c
+++ b/tests/test-rules.c
@@ -3315,6 +3315,30 @@ static void test_hash_module()
       }",
       blob);
 
+  uint8_t blob_tlsh[] = {
+      0x54, 0x68, 0x69, 0x73, 0x20, 0x73, 0x74, 0x72, 0x69 ,0x6e,
+      0x67, 0x20, 0x69, 0x73, 0x20, 0x6c, 0x6f, 0x6f, 0x6f ,0x6f,
+      0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f ,0x6e,
+      0x67, 0x65, 0x72, 0x20, 0x74, 0x68, 0x61, 0x6e, 0x20 ,0x66,
+      0x69, 0x66, 0x74, 0x79, 0x20, 0x62, 0x79, 0x74, 0x65 ,0x73, 0x2e}; // 51 bytes string without trailing zero
+
+  assert_true_rule_blob(
+      "import \"hash\" \
+       rule test { \
+        condition: \
+          hash.tlsh_diff( \
+            \"T1B79004053DF4C050473C01735755410FF75CC0D3171151FC44413010745113D01743D1\") == 0 \
+            and \
+          hash.tlsh_diff( \
+            \"T1B79004053DF4C050473C01735755410FF75CC0D3171151FC44413010745113D01743D1\", \
+            0, filesize) == 0 \
+            and \
+          hash.tlsh_diff( \
+            \"T1B79004053DF4C050473C01735755410FF75CC0D3171151FC44413010745113D01743D1\", \
+            1, filesize) != 0 \
+      }",
+      blob_tlsh);
+
   assert_true_rule(
       "import \"hash\" \
        rule test { \
@@ -3331,6 +3355,10 @@ static void test_hash_module()
           hash.crc32(\"TEST STRING\") == 0x51f9be31 \
             and \
           hash.checksum32(\"TEST STRING\") == 0x337 \
+            and \
+          hash.tlsh_diff( \
+            \"T1B79004053DF4C050473C01735755410FF75CC0D3171151FC44413010745113D01743D1\", \
+            \"This string is looooooooooooonger than fifty bytes.\") == 0 \
       }",
       NULL);
 
@@ -3354,6 +3382,27 @@ static void test_hash_module()
       }",
       blob);
 
+  assert_true_rule_blob(
+      "import \"hash\" \
+       rule test { \
+        condition: \
+          hash.tlsh_diff( \
+            \"T1B79004053DF4C050473C01735755410FF75CC0D3171151FC44413010745113D01743D1\", \
+            0, filesize) == 0 \
+            and \
+          hash.tlsh_diff( \
+            \"T1B79004053DF4C050473C01735755410FF75CC0D3171151FC44413010745113D01743D1\", \
+            1, filesize) != 0 \
+            and \
+          hash.tlsh_diff( \
+            \"B79004053DF4C050473C01735755410FF75CC0D3171151FC44413010745113D01743D1\") == 0 \
+            and \
+          hash.tlsh_diff( \
+            \"B79004053DF4C050473C01735755410FF75CC0D3171151FC44413010745113D01743D1\", \
+            1, filesize) != 0 \
+      }",
+      blob_tlsh);
+
   uint8_t multi_block_blob[] = TEXT_1024_BYTES TEXT_1024_BYTES;
 
   assert_true_rule_blob(
@@ -3399,6 +3448,14 @@ static void test_hash_module()
           hash.crc32(0, filesize) == 0x2b11af72 \
             and \
           hash.crc32(\"TEST STRING\") == 0x51f9be31 \
+            and \
+          hash.tlsh_diff( \
+            \"3741038C22D20C6FEE451103DF0C22DBC343C404F8A2880F10C22060300DE0357238F7\", \
+            0, filesize) == 0 \
+            and \
+          hash.tlsh_diff( \
+            \"3741038C22D20C6FEE451103DF0C22DBC343C404F8A2880F10C22060300DE0357238F7\", \
+            1, filesize) != 0 \
       }",
       multi_block_blob);