Projects
openEuler:24.03:SP1:Everything:64G
gcc
_service:tar_scm:0051-Port-fixes-for-IPA-prefet...
Sign Up
Log In
Username
Password
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File _service:tar_scm:0051-Port-fixes-for-IPA-prefetch-to-GCC-12.patch of Package gcc
From 4c262af8e178ac7c81b32be5b159b4d09a5841c9 Mon Sep 17 00:00:00 2001 From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com> Date: Fri, 8 Mar 2024 07:07:50 +0800 Subject: [PATCH 1/2] Port fixes for IPA prefetch to GCC 12 --- gcc/ipa-devirt.cc | 9 +- gcc/ipa-prefetch.cc | 174 +- gcc/ipa-sra.cc | 7 + gcc/params.opt | 4 +- gcc/testsuite/gcc.dg/completion-1.c | 1 + gcc/testsuite/gcc.dg/ipa/ipa-prefetch-xz.c | 1843 ++++++++++++++++++++ 6 files changed, 1974 insertions(+), 64 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/ipa/ipa-prefetch-xz.c diff --git a/gcc/ipa-devirt.cc b/gcc/ipa-devirt.cc index dd3562d56..dd000b401 100644 --- a/gcc/ipa-devirt.cc +++ b/gcc/ipa-devirt.cc @@ -5029,9 +5029,12 @@ analyze_assign_stmt (gimple *stmt) } else { - fprintf (dump_file, "\nUnsupported rhs type %s in assign stmt: ", - get_tree_code_name (TREE_CODE (rhs))); - print_gimple_stmt (dump_file, stmt, 0); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\nUnsupported rhs type %s in assign stmt: ", + get_tree_code_name (TREE_CODE (rhs))); + print_gimple_stmt (dump_file, stmt, 0); + } gcc_unreachable (); } } diff --git a/gcc/ipa-prefetch.cc b/gcc/ipa-prefetch.cc index aeea51105..9537e4835 100644 --- a/gcc/ipa-prefetch.cc +++ b/gcc/ipa-prefetch.cc @@ -167,6 +167,7 @@ analyse_cgraph () } /* TODO: maybe remove loop info here. */ + n->get_body (); push_cfun (DECL_STRUCT_FUNCTION (n->decl)); calculate_dominance_info (CDI_DOMINATORS); loop_optimizer_init (LOOPS_NORMAL); @@ -942,6 +943,9 @@ compare_memrefs (memref_t* mr, memref_t* mr2) (*mr_candidate_map)[mr] = mr2; return; } + /* Probably we shouldn't leave nulls in the map. */ + if ((*mr_candidate_map)[mr] == NULL) + return; /* TODO: support analysis with incrementation of different fields. */ if ((*mr_candidate_map)[mr]->offset != mr2->offset) { @@ -1090,6 +1094,15 @@ analyse_loops () memref_t *mr = it->first, *mr2 = it->second; if (mr2 == NULL || !(*fmrs_map)[fn]->count (mr)) continue; + /* For now optimize only MRs that mem is MEM_REF. + TODO: support other MR types. */ + if (TREE_CODE (mr->mem) != MEM_REF) + { + if (dump_file) + fprintf (dump_file, "Skip MR %d: unsupported tree code = %s\n", + mr->mr_id, get_tree_code_name (TREE_CODE (mr->mem))); + continue; + } if (!optimize_mrs_map->count (fn)) (*optimize_mrs_map)[fn] = new memref_set; (*optimize_mrs_map)[fn]->insert (mr); @@ -1102,7 +1115,7 @@ analyse_loops () it != (*optimize_mrs_map)[fn]->end (); it++) { memref_t *mr = *it, *mr2 = (*mr_candidate_map)[mr]; - fprintf (dump_file, "MRs %d,%d with incremental offset ", + fprintf (dump_file, "MRs %d, %d with incremental offset ", mr->mr_id, mr2->mr_id); print_generic_expr (dump_file, mr2->offset); fprintf (dump_file, "\n"); @@ -1435,6 +1448,52 @@ remap_gimple_op_r (tree *tp, int *walk_subtrees, void *data) return NULL_TREE; } +/* Copy stmt and remap its operands. */ + +static gimple * +gimple_copy_and_remap (gimple *stmt) +{ + gimple *copy = gimple_copy (stmt); + gcc_checking_assert (!is_gimple_debug (copy)); + + /* Remap all the operands in COPY. */ + struct walk_stmt_info wi; + memset (&wi, 0, sizeof (wi)); + wi.info = copy; + walk_gimple_op (copy, remap_gimple_op_r, &wi); + if (dump_file) + { + fprintf (dump_file, "Stmt copy after remap:\n"); + print_gimple_stmt (dump_file, copy, 0); + } + return copy; +} + +/* Copy and remap stmts listed in MR in reverse order to last_idx, skipping + processed ones. Insert new stmts to the sequence. */ + +static gimple * +gimple_copy_and_remap_memref_stmts (memref_t *mr, gimple_seq &stmts, + int last_idx, stmt_set &processed) +{ + gimple *last_stmt = NULL; + for (int i = mr->stmts.length () - 1; i >= last_idx ; i--) + { + if (processed.count (mr->stmts[i])) + continue; + processed.insert (mr->stmts[i]); + if (dump_file) + { + fprintf (dump_file, "Copy stmt %d from used MR (%d):\n", + i, mr->mr_id); + print_gimple_stmt (dump_file, mr->stmts[i], 0); + } + last_stmt = gimple_copy_and_remap (mr->stmts[i]); + gimple_seq_add_stmt (&stmts, last_stmt); + } + return last_stmt; +} + static void create_cgraph_edge (cgraph_node *n, gimple *stmt) { @@ -1490,6 +1549,13 @@ optimize_function (cgraph_node *n, function *fn) "Skip the case.\n"); return 0; } + if (!tree_fits_shwi_p (inc_mr->step)) + { + if (dump_file) + fprintf (dump_file, "Cannot represent incremental MR's step as " + "integer. Skip the case.\n"); + return 0; + } if (dump_file && !used_mrs.empty ()) print_mrs_ids (used_mrs, "Common list of used mrs:\n"); @@ -1539,16 +1605,44 @@ optimize_function (cgraph_node *n, function *fn) return 0; } else if (dump_file) - fprintf (dump_file, "Dominator bb %d for MRs\n", dom_bb->index); + { + fprintf (dump_file, "Dominator bb %d for MRs:\n", dom_bb->index); + gimple_dump_bb (dump_file, dom_bb, 0, dump_flags); + fprintf (dump_file, "\n"); + } - split_block (dom_bb, (gimple *) NULL); + /* Try to find comp_mr's stmt in the dominator bb. */ + gimple *last_used = NULL; + for (gimple_stmt_iterator si = gsi_last_bb (dom_bb); !gsi_end_p (si); + gsi_prev (&si)) + if (comp_mr->stmts[0] == gsi_stmt (si)) + { + last_used = gsi_stmt (si); + if (dump_file) + { + fprintf (dump_file, "Last used stmt in dominator bb:\n"); + print_gimple_stmt (dump_file, last_used, 0); + } + break; + } + + split_block (dom_bb, last_used); gimple_stmt_iterator gsi = gsi_last_bb (dom_bb); /* Create new inc var. Insert new_var = old_var + step * factor. */ decl_map = new tree_map; gcc_assert (comp_mr->stmts[0] && gimple_assign_single_p (comp_mr->stmts[0])); tree inc_var = gimple_assign_lhs (comp_mr->stmts[0]); + /* If old_var definition dominates the current use, just use it, otherwise + evaluate it just before new inc var evaluation. */ gimple_seq stmts = NULL; + stmt_set processed_stmts; + if (!dominated_by_p (CDI_DOMINATORS, dom_bb, gimple_bb (comp_mr->stmts[0]))) + { + gimple *tmp = gimple_copy_and_remap_memref_stmts (comp_mr, stmts, 0, + processed_stmts); + inc_var = gimple_assign_lhs (tmp); + } tree var_type = TREE_TYPE (inc_var); enum tree_code inc_code; if (TREE_CODE (var_type) == POINTER_TYPE) @@ -1556,52 +1650,28 @@ optimize_function (cgraph_node *n, function *fn) else inc_code = PLUS_EXPR; tree step = inc_mr->step; - unsigned dist_val = tree_to_uhwi (step) * param_ipa_prefetch_distance_factor; + HOST_WIDE_INT dist_val = tree_to_shwi (step) + * param_ipa_prefetch_distance_factor; tree dist = build_int_cst (TREE_TYPE (step), dist_val); tree new_inc_var = gimple_build (&stmts, inc_code, var_type, inc_var, dist); (*decl_map)[inc_var] = new_inc_var; + if (dump_file) + { + fprintf (dump_file, "New distance value: %ld, new inc var: ", dist_val); + print_generic_expr (dump_file, new_inc_var); + fprintf (dump_file, "\n"); + } /* Create other new vars. Insert new stmts. */ - struct walk_stmt_info wi; - stmt_set processed_stmts; - memref_tree_map mr_new_trees; for (memref_set::const_iterator it = used_mrs.begin (); it != used_mrs.end (); it++) { memref_t *mr = *it; - gimple *last_stmt = NULL; if (mr == comp_mr) continue; - for (int i = mr->stmts.length () - 1; i >= 0 ; i--) - { - if (processed_stmts.count (mr->stmts[i])) - continue; - processed_stmts.insert (mr->stmts[i]); - if (dump_file) - { - fprintf (dump_file, "Copy stmt %d from used MR (%d):\n", - i, mr->mr_id); - print_gimple_stmt (dump_file, mr->stmts[i], 0); - } - /* Create a new copy of STMT and duplicate STMT's virtual - operands. */ - gimple *copy = gimple_copy (mr->stmts[i]); - gcc_checking_assert (!is_gimple_debug (copy)); - - /* Remap all the operands in COPY. */ - memset (&wi, 0, sizeof (wi)); - last_stmt = copy; - wi.info = copy; - walk_gimple_op (copy, remap_gimple_op_r, &wi); - if (dump_file) - { - fprintf (dump_file, "Stmt %d after remap:\n",i); - print_gimple_stmt (dump_file, copy, 0); - } - gimple_seq_add_stmt (&stmts, copy); - } + gimple *last_stmt = gimple_copy_and_remap_memref_stmts (mr, stmts, 0, + processed_stmts); gcc_assert (last_stmt); - mr_new_trees[mr] = gimple_assign_lhs (last_stmt); if (dump_file) { fprintf (dump_file, "MR (%d) new mem: ", mr->mr_id); @@ -1637,29 +1707,9 @@ optimize_function (cgraph_node *n, function *fn) memref_t *mr = vmrs[j]; /* Don't need to copy the last stmt, since we insert prefetch insn instead of it. */ - for (int i = mr->stmts.length () - 1; i >= 1 ; i--) - { - if (processed_stmts.count (mr->stmts[i])) - continue; - processed_stmts.insert (mr->stmts[i]); - - gimple *copy = gimple_copy (mr->stmts[i]); - gcc_checking_assert (!is_gimple_debug (copy)); - - /* Remap all the operands in COPY. */ - memset (&wi, 0, sizeof (wi)); - wi.info = copy; - walk_gimple_op (copy, remap_gimple_op_r, &wi); - if (dump_file) - { - fprintf (dump_file, "Stmt %d after remap:\n",i); - print_gimple_stmt (dump_file, copy, 0); - } - gimple_seq_add_stmt (&stmts, copy); - } + gimple_copy_and_remap_memref_stmts (mr, stmts, 1, processed_stmts); gimple *last_stmt = mr->stmts[0]; gcc_assert (last_stmt); - mr_new_trees[mr] = gimple_assign_lhs (last_stmt); tree write_p = mr->is_store ? integer_one_node : integer_zero_node; tree addr = get_mem_ref_address_ssa_name (mr->mem, NULL_TREE); if (decl_map->count (addr)) @@ -1668,6 +1718,11 @@ optimize_function (cgraph_node *n, function *fn) 3, addr, write_p, local); pcalls.safe_push (last_stmt); gimple_seq_add_stmt (&stmts, last_stmt); + if (dump_file) + { + fprintf (dump_file, "Insert %d prefetch stmt:\n", j); + print_gimple_stmt (dump_file, last_stmt, 0); + } } gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); @@ -1677,6 +1732,7 @@ optimize_function (cgraph_node *n, function *fn) for (unsigned i = 0; i < pcalls.length (); i++) create_cgraph_edge (n, pcalls[i]); ipa_update_overall_fn_summary (n); + renumber_gimple_stmt_uids (DECL_STRUCT_FUNCTION (n->decl)); return 1; } @@ -1806,7 +1862,7 @@ pass_ipa_prefetch::gate (function *) /* Don't bother doing anything if the program has errors. */ && !seen_error () && flag_lto_partition == LTO_PARTITION_ONE - /* Only enable struct optimizations in lto or whole_program. */ + /* Only enable prefetch optimizations in lto or whole_program. */ && (in_lto_p || flag_whole_program)); } diff --git a/gcc/ipa-sra.cc b/gcc/ipa-sra.cc index 5355cf2f4..471b3927c 100644 --- a/gcc/ipa-sra.cc +++ b/gcc/ipa-sra.cc @@ -3393,6 +3393,13 @@ param_splitting_across_edge (cgraph_edge *cs) gcc_checking_assert (from_ifs && from_ifs->m_parameters); isra_call_summary *csum = call_sums->get (cs); + /* TODO: implement better support for call edges inserted after summary + collection but before sra wpa invocation. */ + if (!csum) + { + csum = call_sums->get_create (cs); + csum->m_return_ignored = true; + } gcc_checking_assert (csum); unsigned args_count = csum->m_arg_flow.length (); isra_func_summary *to_ifs = func_sums->get (callee); diff --git a/gcc/params.opt b/gcc/params.opt index 5c07e3986..50385dfd7 100644 --- a/gcc/params.opt +++ b/gcc/params.opt @@ -314,8 +314,8 @@ Common Joined UInteger Var(param_ipa_prefetch_distance_factor) Init(4) Param Opt The factor represents the number of inductive variable incrementations to evaluate an indirect memory address for IPA prefetch. -param=ipa-prefetch-locality= -Common Joined UInteger Var(param_ipa_prefetch_locality) Init(3) Param Optimization -The flag represents temporal locality values in the following way: 0:pstl1strm, 1:pstl3keep, 2:pstl2keep, 3:pstl1keep. +Common Joined UInteger Var(param_ipa_prefetch_locality) Init(3) IntegerRange(0, 3) Param Optimization +The flag represents temporal locality value between 0 and 3, the higher value means the higher temporal locality in the data. -param=ira-loop-reserved-regs= Common Joined UInteger Var(param_ira_loop_reserved_regs) Init(2) Param Optimization diff --git a/gcc/testsuite/gcc.dg/completion-1.c b/gcc/testsuite/gcc.dg/completion-1.c index 64da64f1c..df2319c76 100644 --- a/gcc/testsuite/gcc.dg/completion-1.c +++ b/gcc/testsuite/gcc.dg/completion-1.c @@ -2,6 +2,7 @@ /* { dg-options "--completion=-fipa-ic" } */ /* { dg-begin-multiline-output "" } +-fipa-ic -fipa-icf -fipa-icf-functions -fipa-icf-variables diff --git a/gcc/testsuite/gcc.dg/ipa/ipa-prefetch-xz.c b/gcc/testsuite/gcc.dg/ipa/ipa-prefetch-xz.c new file mode 100644 index 000000000..bd4fb2bdc --- /dev/null +++ b/gcc/testsuite/gcc.dg/ipa/ipa-prefetch-xz.c @@ -0,0 +1,1843 @@ +/* { dg-do link } */ +/* { dg-options "-O3 -fipa-ic -fipa-prefetch -flto -flto-partition=one -fdump-ipa-ipa_prefetch -fdump-ipa-icp" } */ +/* { dg-require-effective-target lto } */ + +/* Based on opensource xz code. */ + +#include <stdlib.h> +#include <string.h> + +typedef long int ptrdiff_t; +typedef long unsigned int size_t; +typedef unsigned int wchar_t; + +typedef unsigned char __u_char; +typedef unsigned short int __u_short; +typedef unsigned int __u_int; +typedef unsigned long int __u_long; + +typedef signed char __int8_t; +typedef unsigned char __uint8_t; +typedef signed short int __int16_t; +typedef unsigned short int __uint16_t; +typedef signed int __int32_t; +typedef unsigned int __uint32_t; + +typedef signed long int __int64_t; +typedef unsigned long int __uint64_t; + +typedef __int8_t __int_least8_t; +typedef __uint8_t __uint_least8_t; +typedef __int16_t __int_least16_t; +typedef __uint16_t __uint_least16_t; +typedef __int32_t __int_least32_t; +typedef __uint32_t __uint_least32_t; +typedef __int64_t __int_least64_t; +typedef __uint64_t __uint_least64_t; + +typedef __int8_t int8_t; +typedef __int16_t int16_t; +typedef __int32_t int32_t; +typedef __int64_t int64_t; + +typedef __uint8_t uint8_t; +typedef __uint16_t uint16_t; +typedef __uint32_t uint32_t; +typedef __uint64_t uint64_t; + +typedef long int intptr_t; +typedef unsigned long int uintptr_t; + +static inline uint16_t +read16ne(const uint8_t *buf) +{ + uint16_t num; + memcpy(&num, buf, sizeof(num)); + return num; +} + +static inline uint32_t +read32ne(const uint8_t *buf) +{ + uint32_t num; + memcpy(&num, buf, sizeof(num)); + return num; +} + +static inline uint16_t +aligned_read16ne(const uint8_t *buf) +{ + uint16_t num; + memcpy(&num, __builtin_assume_aligned(buf, sizeof(num)), sizeof(num)); + return num; +} + + +static inline uint32_t +aligned_read32ne(const uint8_t *buf) +{ + uint32_t num; + memcpy(&num, __builtin_assume_aligned(buf, sizeof(num)), sizeof(num)); + return num; +} + +static inline uint64_t +aligned_read64ne(const uint8_t *buf) +{ + uint64_t num; + memcpy(&num, __builtin_assume_aligned(buf, sizeof(num)), sizeof(num)); + return num; +} + +typedef unsigned char lzma_bool; + +typedef enum { + LZMA_RESERVED_ENUM = 0 +} lzma_reserved_enum; + +typedef enum { + LZMA_OK = 0, + LZMA_STREAM_END = 1, + LZMA_NO_CHECK = 2, + LZMA_UNSUPPORTED_CHECK = 3, + LZMA_GET_CHECK = 4, + LZMA_MEM_ERROR = 5, + LZMA_MEMLIMIT_ERROR = 6, + LZMA_FORMAT_ERROR = 7, + LZMA_OPTIONS_ERROR = 8, + LZMA_DATA_ERROR = 9, + LZMA_BUF_ERROR = 10, + LZMA_PROG_ERROR = 11, +} lzma_ret; + +typedef enum { + LZMA_RUN = 0, + LZMA_SYNC_FLUSH = 1, + LZMA_FULL_FLUSH = 2, + LZMA_FULL_BARRIER = 4, + LZMA_FINISH = 3 +} lzma_action; + +typedef struct { + void *( *alloc)(void *opaque, size_t nmemb, size_t size); + + void ( *free)(void *opaque, void *ptr); + + void *opaque; +} lzma_allocator; + +typedef uint64_t lzma_vli; + +typedef enum { + LZMA_CHECK_NONE = 0, + LZMA_CHECK_CRC32 = 1, + LZMA_CHECK_CRC64 = 4, + LZMA_CHECK_SHA256 = 10 +} lzma_check; + +typedef struct { + lzma_vli id; + void *options; +} lzma_filter; + +typedef enum { + LZMA_MF_HC3 = 0x03, + LZMA_MF_HC4 = 0x04, + LZMA_MF_BT2 = 0x12, + LZMA_MF_BT3 = 0x13, + LZMA_MF_BT4 = 0x14 +} lzma_match_finder; + +typedef struct lzma_next_coder_s lzma_next_coder; + +typedef struct lzma_filter_info_s lzma_filter_info; + +typedef lzma_ret (*lzma_init_function)( + lzma_next_coder *next, const lzma_allocator *allocator, + const lzma_filter_info *filters); + +typedef lzma_ret (*lzma_code_function)( + void *coder, const lzma_allocator *allocator, + const uint8_t *restrict in, size_t *restrict in_pos, + size_t in_size, uint8_t *restrict out, + size_t *restrict out_pos, size_t out_size, + lzma_action action); + +typedef void (*lzma_end_function)( + void *coder, const lzma_allocator *allocator); + +struct lzma_filter_info_s { + lzma_vli id; + lzma_init_function init; + void *options; +}; + +struct lzma_next_coder_s { + void *coder; + lzma_vli id; + uintptr_t init; + + lzma_code_function code; + lzma_end_function end; + void (*get_progress)(void *coder, + uint64_t *progress_in, uint64_t *progress_out); + + lzma_check (*get_check)(const void *coder); + lzma_ret (*memconfig)(void *coder, uint64_t *memusage, + uint64_t *old_memlimit, uint64_t new_memlimit); + lzma_ret (*update)(void *coder, const lzma_allocator *allocator, + const lzma_filter *filters, const lzma_filter *reversed_filters); +}; + +typedef struct { + uint32_t len; + uint32_t dist; +} lzma_match; + +typedef struct lzma_mf_s lzma_mf; +struct lzma_mf_s { + uint8_t *buffer; + uint32_t size; + uint32_t keep_size_before; + uint32_t keep_size_after; + uint32_t offset; + uint32_t read_pos; + uint32_t read_ahead; + uint32_t read_limit; + uint32_t write_pos; + uint32_t pending; + uint32_t (*find)(lzma_mf *mf, lzma_match *matches); + void (*skip)(lzma_mf *mf, uint32_t num); + uint32_t *hash; + uint32_t *son; + uint32_t cyclic_pos; + uint32_t cyclic_size; + uint32_t hash_mask; + uint32_t depth; + uint32_t nice_len; + uint32_t match_len_max; + lzma_action action; + uint32_t hash_count; + uint32_t sons_count; +}; + +typedef struct { + size_t before_size; + size_t dict_size; + size_t after_size; + size_t match_len_max; + size_t nice_len; + lzma_match_finder match_finder; + uint32_t depth; + const uint8_t *preset_dict; + uint32_t preset_dict_size; +} lzma_lz_options; + +typedef struct { + void *coder; + lzma_ret (*code)(void *coder, + lzma_mf *restrict mf, uint8_t *restrict out, + size_t *restrict out_pos, size_t out_size); + void (*end)(void *coder, const lzma_allocator *allocator); + lzma_ret (*options_update)(void *coder, const lzma_filter *filter); +} lzma_lz_encoder; + +static inline const uint8_t * +mf_ptr(const lzma_mf *mf) +{ + return mf->buffer + mf->read_pos; +} + +static inline uint32_t +mf_avail(const lzma_mf *mf) +{ + return mf->write_pos - mf->read_pos; +} + +typedef struct { + uint32_t state[8]; + uint64_t size; +} lzma_sha256_state; + +typedef struct { + union { + uint8_t u8[64]; + uint32_t u32[16]; + uint64_t u64[8]; + } buffer; + union { + uint32_t crc32; + uint64_t crc64; + lzma_sha256_state sha256; + } state; +} lzma_check_state; + +// The table is constantly initialized in the original code. +// Skip it in the test. +const uint32_t lzma_crc32_table[8][256]; + +static inline uint32_t __attribute__((__always_inline__)) +lzma_memcmplen(const uint8_t *buf1, const uint8_t *buf2, + uint32_t len, uint32_t limit) +{ + while (len < limit) { + uint32_t x = read32ne(buf1 + len) - read32ne(buf2 + len); + if (x != 0) { + if ((x & 0xFFFF) == 0) { + len += 2; + x >>= 16; + } + + if ((x & 0xFF) == 0) + ++len; + + return ((len) < (limit) ? (len) : (limit)); + } + + len += 4; + } + + return limit; +} + +extern uint32_t +lzma_mf_find(lzma_mf *mf, uint32_t *count_ptr, lzma_match *matches) +{ + const uint32_t count = mf->find(mf, matches); + uint32_t len_best = 0; + + if (count > 0) { + len_best = matches[count - 1].len; + if (len_best == mf->nice_len) { + uint32_t limit = mf_avail(mf) + 1; + if (limit > mf->match_len_max) + limit = mf->match_len_max; + const uint8_t *p1 = mf_ptr(mf) - 1; + const uint8_t *p2 = p1 - matches[count - 1].dist - 1; + len_best = lzma_memcmplen(p1, p2, len_best, limit); + } + } + + *count_ptr = count; + ++mf->read_ahead; + + return len_best; +} + +static void +normalize(lzma_mf *mf) +{ + const uint32_t subvalue = ((4294967295U) - mf->cyclic_size); + + for (uint32_t i = 0; i < mf->hash_count; ++i) { + if (mf->hash[i] <= subvalue) + mf->hash[i] = 0; + else + mf->hash[i] -= subvalue; + } + + for (uint32_t i = 0; i < mf->sons_count; ++i) { + if (mf->son[i] <= subvalue) + mf->son[i] = 0; + else + mf->son[i] -= subvalue; + } + + mf->offset -= subvalue; + return; +} + +static void +move_pos(lzma_mf *mf) +{ + if (++mf->cyclic_pos == mf->cyclic_size) + mf->cyclic_pos = 0; + ++mf->read_pos; + if (__builtin_expect(mf->read_pos + mf->offset == (4294967295U), 0 )) + normalize(mf); +} + +static void +move_pending(lzma_mf *mf) +{ + ++mf->read_pos; + ++mf->pending; +} + +static lzma_match * +hc_find_func( + const uint32_t len_limit, + const uint32_t pos, + const uint8_t *const cur, + uint32_t cur_match, + uint32_t depth, + uint32_t *const son, + const uint32_t cyclic_pos, + const uint32_t cyclic_size, + lzma_match *matches, + uint32_t len_best) +{ + son[cyclic_pos] = cur_match; + + while (1) { + const uint32_t delta = pos - cur_match; + if (depth-- == 0 || delta >= cyclic_size) + return matches; + + const uint8_t *const pb = cur - delta; + cur_match = son[cyclic_pos - delta + + (delta > cyclic_pos ? cyclic_size : 0)]; + + if (pb[len_best] == cur[len_best] && pb[0] == cur[0]) { + uint32_t len = lzma_memcmplen(pb, cur, 1, len_limit); + + if (len_best < len) { + len_best = len; + matches->len = len; + matches->dist = delta - 1; + ++matches; + + if (len == len_limit) + return matches; + } + } + } +} + +extern uint32_t +lzma_mf_hc3_find(lzma_mf *mf, lzma_match *matches) +{ + uint32_t len_limit = mf_avail(mf); + if (mf->nice_len <= len_limit) { + len_limit = mf->nice_len; + } else if (len_limit < (3)) { + move_pending(mf); + return 0; + } + const uint8_t *cur = mf_ptr(mf); + const uint32_t pos = mf->read_pos + mf->offset; + uint32_t matches_count = 0; + + const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1]; + const uint32_t hash_2_value = temp & ((1U << 10) - 1); + const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8)) & mf->hash_mask; + + const uint32_t delta2 = pos - mf->hash[hash_2_value]; + const uint32_t cur_match = mf->hash[((1U << 10)) + hash_value]; + + mf->hash[hash_2_value] = pos; + mf->hash[((1U << 10)) + hash_value] = pos; + + uint32_t len_best = 2; + + if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) { + len_best = lzma_memcmplen(cur - delta2, cur, len_best, len_limit); + + matches[0].len = len_best; + matches[0].dist = delta2 - 1; + matches_count = 1; + + if (len_best == len_limit) { + mf->son[mf->cyclic_pos] = cur_match; + move_pos(mf); + return 1; + } + } + + matches_count = hc_find_func(len_limit, pos, cur, cur_match, mf->depth, + mf->son, mf->cyclic_pos, mf->cyclic_size, + matches + matches_count, len_best) - matches; + move_pos(mf); + return matches_count; +} + +extern void +lzma_mf_hc3_skip(lzma_mf *mf, uint32_t amount) +{ + do { + if (mf_avail(mf) < 3) { + move_pending(mf); + continue; + } + + const uint8_t *cur = mf_ptr(mf); + const uint32_t pos = mf->read_pos + mf->offset; + + const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1]; + const uint32_t hash_2_value = temp & ((1U << 10) - 1); + const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8)) & mf->hash_mask; + + const uint32_t cur_match + = mf->hash[((1U << 10)) + hash_value]; + + mf->hash[hash_2_value] = pos; + mf->hash[((1U << 10)) + hash_value] = pos; + + do { mf->son[mf->cyclic_pos] = cur_match; move_pos(mf); } while (0); + + } while (--amount != 0); +} + +extern uint32_t +lzma_mf_hc4_find(lzma_mf *mf, lzma_match *matches) +{ + uint32_t len_limit = mf_avail(mf); + if (mf->nice_len <= len_limit) { + len_limit = mf->nice_len; + } else if (len_limit < (4)) { + move_pending(mf); + return 0; + } + const uint8_t *cur = mf_ptr(mf); + const uint32_t pos = mf->read_pos + mf->offset; + uint32_t matches_count = 0; + + const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1]; + const uint32_t hash_2_value = temp & ((1U << 10) - 1); + const uint32_t hash_3_value = (temp ^ ((uint32_t)(cur[2]) << 8)) + & ((1U << 16) - 1); + const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8) + ^ (lzma_crc32_table[0][cur[3]] << 5)) + & mf->hash_mask; + uint32_t delta2 = pos - mf->hash[hash_2_value]; + const uint32_t delta3 + = pos - mf->hash[((1U << 10)) + hash_3_value]; + const uint32_t cur_match = mf->hash[((1U << 10) + (1U << 16)) + hash_value]; + + mf->hash[hash_2_value ] = pos; + mf->hash[((1U << 10)) + hash_3_value] = pos; + mf->hash[((1U << 10) + (1U << 16)) + hash_value] = pos; + + uint32_t len_best = 1; + + if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) { + len_best = 2; + matches[0].len = 2; + matches[0].dist = delta2 - 1; + matches_count = 1; + } + + if (delta2 != delta3 && delta3 < mf->cyclic_size + && *(cur - delta3) == *cur) { + len_best = 3; + matches[matches_count++].dist = delta3 - 1; + delta2 = delta3; + } + + if (matches_count != 0) { + len_best = lzma_memcmplen(cur - delta2, cur, + len_best, len_limit); + + matches[matches_count - 1].len = len_best; + + if (len_best == len_limit) { + mf->son[mf->cyclic_pos] = cur_match; move_pos(mf); + return matches_count; + } + } + + if (len_best < 3) + len_best = 3; + + matches_count = hc_find_func(len_limit, pos, cur, cur_match, mf->depth, + mf->son, mf->cyclic_pos, mf->cyclic_size, + matches + matches_count, len_best) - matches; + move_pos(mf); + return matches_count; +} + +extern void +lzma_mf_hc4_skip(lzma_mf *mf, uint32_t amount) +{ + do { + if (mf_avail(mf) < 4) { + move_pending(mf); + continue; + } + + const uint8_t *cur = mf_ptr(mf); + const uint32_t pos = mf->read_pos + mf->offset; + + const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1]; + const uint32_t hash_2_value = temp & ((1U << 10) - 1); + const uint32_t hash_3_value = (temp ^ ((uint32_t)(cur[2]) << 8)) & ((1U << 16) - 1); + const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8) + ^ (lzma_crc32_table[0][cur[3]] << 5)) + & mf->hash_mask; + + const uint32_t cur_match + = mf->hash[((1U << 10) + (1U << 16)) + hash_value]; + + mf->hash[hash_2_value] = pos; + mf->hash[((1U << 10)) + hash_3_value] = pos; + mf->hash[((1U << 10) + (1U << 16)) + hash_value] = pos; + + mf->son[mf->cyclic_pos] = cur_match; + move_pos(mf); + } while (--amount != 0); +} + +static lzma_match * +bt_find_func( + const uint32_t len_limit, + const uint32_t pos, + const uint8_t *const cur, + uint32_t cur_match, + uint32_t depth, + uint32_t *const son, + const uint32_t cyclic_pos, + const uint32_t cyclic_size, + lzma_match *matches, + uint32_t len_best) +{ + uint32_t *ptr0 = son + (cyclic_pos << 1) + 1; + uint32_t *ptr1 = son + (cyclic_pos << 1); + + uint32_t len0 = 0; + uint32_t len1 = 0; + + while (1) { + const uint32_t delta = pos - cur_match; + if (depth-- == 0 || delta >= cyclic_size) { + *ptr0 = 0; + *ptr1 = 0; + return matches; + } + + uint32_t *const pair = son + ((cyclic_pos - delta + + (delta > cyclic_pos ? cyclic_size : 0)) + << 1); + + const uint8_t *const pb = cur - delta; + uint32_t len = ((len0) < (len1) ? (len0) : (len1)); + + if (pb[len] == cur[len]) { + len = lzma_memcmplen(pb, cur, len + 1, len_limit); + + if (len_best < len) { + len_best = len; + matches->len = len; + matches->dist = delta - 1; + ++matches; + + if (len == len_limit) { + *ptr1 = pair[0]; + *ptr0 = pair[1]; + return matches; + } + } + } + + if (pb[len] < cur[len]) { + *ptr1 = cur_match; + ptr1 = pair + 1; + cur_match = *ptr1; + len1 = len; + } else { + *ptr0 = cur_match; + ptr0 = pair; + cur_match = *ptr0; + len0 = len; + } + } +} + + +static void +bt_skip_func( + const uint32_t len_limit, + const uint32_t pos, + const uint8_t *const cur, + uint32_t cur_match, + uint32_t depth, + uint32_t *const son, + const uint32_t cyclic_pos, + const uint32_t cyclic_size) +{ + uint32_t *ptr0 = son + (cyclic_pos << 1) + 1; + uint32_t *ptr1 = son + (cyclic_pos << 1); + + uint32_t len0 = 0; + uint32_t len1 = 0; + + while (1) { + const uint32_t delta = pos - cur_match; + if (depth-- == 0 || delta >= cyclic_size) { + *ptr0 = 0; + *ptr1 = 0; + return; + } + + uint32_t *pair = son + ((cyclic_pos - delta + + (delta > cyclic_pos ? cyclic_size : 0)) + << 1); + const uint8_t *pb = cur - delta; + uint32_t len = ((len0) < (len1) ? (len0) : (len1)); + + if (pb[len] == cur[len]) { + len = lzma_memcmplen(pb, cur, len + 1, len_limit); + + if (len == len_limit) { + *ptr1 = pair[0]; + *ptr0 = pair[1]; + return; + } + } + + if (pb[len] < cur[len]) { + *ptr1 = cur_match; + ptr1 = pair + 1; + cur_match = *ptr1; + len1 = len; + } else { + *ptr0 = cur_match; + ptr0 = pair; + cur_match = *ptr0; + len0 = len; + } + } +} + +extern uint32_t +lzma_mf_bt2_find(lzma_mf *mf, lzma_match *matches) +{ + uint32_t len_limit = mf_avail(mf); + if (mf->nice_len <= len_limit) { + len_limit = mf->nice_len; + } else if (len_limit < (2) || (mf->action == LZMA_SYNC_FLUSH)) { + move_pending(mf); + return 0; + } + const uint8_t *cur = mf_ptr(mf); + const uint32_t pos = mf->read_pos + mf->offset; + uint32_t matches_count = 0; + const uint32_t hash_value = read16ne(cur); + const uint32_t cur_match = mf->hash[hash_value]; + mf->hash[hash_value] = pos; + + matches_count = bt_find_func(len_limit, pos, cur, cur_match, mf->depth, + mf->son, mf->cyclic_pos, mf->cyclic_size, + matches + matches_count, 1) - matches; + move_pos(mf); + return matches_count; +} + +extern void +lzma_mf_bt2_skip(lzma_mf *mf, uint32_t amount) +{ + do { + uint32_t len_limit = mf_avail(mf); + if (mf->nice_len <= len_limit) { + len_limit = mf->nice_len; + } else if (len_limit < (2) || (mf->action == LZMA_SYNC_FLUSH)) { + move_pending(mf); + continue; + } + const uint8_t *cur = mf_ptr(mf); + const uint32_t pos = mf->read_pos + mf->offset; + + const uint32_t hash_value = read16ne(cur); + const uint32_t cur_match = mf->hash[hash_value]; + mf->hash[hash_value] = pos; + + bt_skip_func(len_limit, pos, cur, cur_match, mf->depth, mf->son, + mf->cyclic_pos, mf->cyclic_size); + move_pos(mf); + } while (--amount != 0); +} + +extern uint32_t +lzma_mf_bt3_find(lzma_mf *mf, lzma_match *matches) +{ + uint32_t len_limit = mf_avail(mf); + if (mf->nice_len <= len_limit) { + len_limit = mf->nice_len; + } else if (len_limit < (3) || (1 && mf->action == LZMA_SYNC_FLUSH)) { + move_pending(mf); + return 0; + } + const uint8_t *cur = mf_ptr(mf); + const uint32_t pos = mf->read_pos + mf->offset; + uint32_t matches_count = 0; + + const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1]; + const uint32_t hash_2_value = temp & ((1U << 10) - 1); + const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8)) & mf->hash_mask; + + const uint32_t delta2 = pos - mf->hash[hash_2_value]; + const uint32_t cur_match = mf->hash[((1U << 10)) + hash_value]; + + mf->hash[hash_2_value] = pos; + mf->hash[((1U << 10)) + hash_value] = pos; + + uint32_t len_best = 2; + + if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) { + len_best = lzma_memcmplen( + cur, cur - delta2, len_best, len_limit); + + matches[0].len = len_best; + matches[0].dist = delta2 - 1; + matches_count = 1; + + if (len_best == len_limit) { + bt_skip_func(len_limit, pos, cur, cur_match, mf->depth, mf->son, + mf->cyclic_pos, mf->cyclic_size); + move_pos(mf); + return 1; + } + } + + matches_count = bt_find_func(len_limit, pos, cur, cur_match, mf->depth, + mf->son, mf->cyclic_pos, mf->cyclic_size, + matches + matches_count, len_best) - matches; + move_pos(mf); + return matches_count; +} + + +extern void +lzma_mf_bt3_skip(lzma_mf *mf, uint32_t amount) +{ + do { + uint32_t len_limit = mf_avail(mf); + if (mf->nice_len <= len_limit) { + len_limit = mf->nice_len; } + else if (len_limit < (3) || (1 && mf->action == LZMA_SYNC_FLUSH)) { + move_pending(mf); + continue; + } + const uint8_t *cur = mf_ptr(mf); + const uint32_t pos = mf->read_pos + mf->offset; + + const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1]; + const uint32_t hash_2_value = temp & ((1U << 10) - 1); + const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8)) & mf->hash_mask; + + const uint32_t cur_match = mf->hash[((1U << 10)) + hash_value]; + + mf->hash[hash_2_value] = pos; + mf->hash[((1U << 10)) + hash_value] = pos; + + bt_skip_func(len_limit, pos, cur, cur_match, mf->depth, mf->son, + mf->cyclic_pos, mf->cyclic_size); + move_pos(mf); + } while (--amount != 0); +} + +extern uint32_t +lzma_mf_bt4_find(lzma_mf *mf, lzma_match *matches) +{ + uint32_t len_limit = mf->write_pos - mf->read_pos; + if (mf->nice_len <= len_limit) { + len_limit = mf->nice_len; + } else if (len_limit < (4) || (mf->action == LZMA_SYNC_FLUSH)) { + ++mf->read_pos; + ++mf->pending; + return 0; + } + + const uint8_t *cur = mf->buffer + mf->read_pos; + const uint32_t pos = mf->read_pos + mf->offset; + uint32_t matches_count = 0; + + const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1]; + const uint32_t hash_2_value = temp & ((1U << 10) - 1); + const uint32_t hash_3_value = (temp ^ ((uint32_t)(cur[2]) << 8)) & ((1U << 16) - 1); + const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8) + ^ (lzma_crc32_table[0][cur[3]] << 5)) + & mf->hash_mask; + + uint32_t delta2 = pos - mf->hash[hash_2_value]; + const uint32_t delta3 = pos - mf->hash[((1U << 10)) + hash_3_value]; + const uint32_t cur_match = mf->hash[((1U << 10) + (1U << 16)) + hash_value]; + + mf->hash[hash_2_value] = pos; + mf->hash[((1U << 10)) + hash_3_value] = pos; + mf->hash[((1U << 10) + (1U << 16)) + hash_value] = pos; + + uint32_t len_best = 1; + + if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) { + len_best = 2; + matches[0].len = 2; + matches[0].dist = delta2 - 1; + matches_count = 1; + } + + if (delta2 != delta3 && delta3 < mf->cyclic_size && *(cur - delta3) == *cur) { + len_best = 3; + matches[matches_count++].dist = delta3 - 1; + delta2 = delta3; + } + + if (matches_count != 0) { + len_best = lzma_memcmplen(cur, cur - delta2, len_best, len_limit); + + matches[matches_count - 1].len = len_best; + + if (len_best == len_limit) { + bt_skip_func(len_limit, pos, cur, cur_match, mf->depth, mf->son, + mf->cyclic_pos, mf->cyclic_size); + move_pos(mf); + return matches_count; + } + } + + if (len_best < 3) + len_best = 3; + + matches_count = bt_find_func(len_limit, pos, cur, cur_match, mf->depth, mf->son, + mf->cyclic_pos, mf->cyclic_size, + matches + matches_count, len_best) - matches; + move_pos(mf); + return matches_count; +} + +extern void +lzma_mf_bt4_skip(lzma_mf *mf, uint32_t amount) +{ + do { + uint32_t len_limit = mf_avail(mf); + if (mf->nice_len <= len_limit) { + len_limit = mf->nice_len; + } else if (len_limit < (4) || (mf->action == LZMA_SYNC_FLUSH)) { + move_pending(mf); + continue; + } + + const uint8_t *cur = mf->buffer + mf->read_pos; + const uint32_t pos = mf->read_pos + mf->offset; + + const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1]; + const uint32_t hash_2_value = temp & ((1U << 10) - 1); + const uint32_t hash_3_value = (temp ^ ((uint32_t)(cur[2]) << 8)) + & ((1U << 16) - 1); + const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8) + ^ (lzma_crc32_table[0][cur[3]] << 5)) + & mf->hash_mask; + + const uint32_t cur_match = mf->hash[((1U << 10) + (1U << 16)) + hash_value]; + + mf->hash[hash_2_value] = pos; + mf->hash[((1U << 10)) + hash_3_value] = pos; + mf->hash[((1U << 10) + (1U << 16)) + hash_value] = pos; + + bt_skip_func(len_limit, pos, cur, cur_match, mf->depth, mf->son, + mf->cyclic_pos, mf->cyclic_size); + move_pos(mf); + } while (--amount != 0); +} + +static inline void +mf_skip(lzma_mf *mf, uint32_t amount) +{ + if (amount != 0) { + mf->skip(mf, amount); + mf->read_ahead += amount; + } +} + +typedef struct lzma_lzma1_encoder_s lzma_lzma1_encoder; +typedef uint16_t probability; + +typedef struct { + probability choice; + probability choice2; + probability low[(1 << 4)][(1 << 3)]; + probability mid[(1 << 4)][(1 << 3)]; + probability high[(1 << 8)]; + uint32_t prices[(1 << 4)][((1 << 3) + (1 << 3) + (1 << 8))]; + uint32_t table_size; + uint32_t counters[(1 << 4)]; +} lzma_length_encoder; + +typedef struct { + uint64_t low; + uint64_t cache_size; + uint32_t range; + uint8_t cache; + size_t count; + size_t pos; + + enum { + RC_BIT_0, + RC_BIT_1, + RC_DIRECT_0, + RC_DIRECT_1, + RC_FLUSH, + } symbols[58]; + + probability *probs[58]; +} lzma_range_encoder; + + +typedef enum { + STATE_LIT_LIT, + STATE_MATCH_LIT_LIT, + STATE_REP_LIT_LIT, + STATE_SHORTREP_LIT_LIT, + STATE_MATCH_LIT, + STATE_REP_LIT, + STATE_SHORTREP_LIT, + STATE_LIT_MATCH, + STATE_LIT_LONGREP, + STATE_LIT_SHORTREP, + STATE_NONLIT_MATCH, + STATE_NONLIT_REP, +} lzma_lzma_state; + +typedef struct { + lzma_lzma_state state; + _Bool prev_1_is_literal; + _Bool prev_2; + + uint32_t pos_prev_2; + uint32_t back_prev_2; + + uint32_t price; + uint32_t pos_prev; + uint32_t back_prev; + + uint32_t backs[4]; +} lzma_optimal; + +struct lzma_lzma1_encoder_s { + lzma_range_encoder rc; + lzma_lzma_state state; + uint32_t reps[4]; + lzma_match matches[(2 + ((1 << 3) + (1 << 3) + (1 << 8)) - 1) + 1]; + uint32_t matches_count; + uint32_t longest_match_length; + _Bool fast_mode; + _Bool is_initialized; + _Bool is_flushed; + uint32_t pos_mask; + uint32_t literal_context_bits; + uint32_t literal_pos_mask; + + probability literal[(1 << 4)][0x300]; + probability is_match[12][(1 << 4)]; + probability is_rep[12]; + probability is_rep0[12]; + probability is_rep1[12]; + probability is_rep2[12]; + probability is_rep0_long[12][(1 << 4)]; + probability dist_slot[4][(1 << 6)]; + probability dist_special[(1 << (14 / 2)) - 14]; + probability dist_align[(1 << 4)]; + + lzma_length_encoder match_len_encoder; + lzma_length_encoder rep_len_encoder; + + uint32_t dist_slot_prices[4][(1 << 6)]; + uint32_t dist_prices[4][(1 << (14 / 2))]; + uint32_t dist_table_size; + uint32_t match_price_count; + + uint32_t align_prices[(1 << 4)]; + uint32_t align_price_count; + uint32_t opts_end_index; + uint32_t opts_current_index; + lzma_optimal opts[(1 << 12)]; +}; + +extern void +lzma_lzma_optimum_fast(lzma_lzma1_encoder *restrict coder, + lzma_mf *restrict mf, + uint32_t *restrict back_res, uint32_t *restrict len_res) +{ + const uint32_t nice_len = mf->nice_len; + + uint32_t len_main; + uint32_t matches_count; + if (mf->read_ahead == 0) { + len_main = lzma_mf_find(mf, &matches_count, coder->matches); + } else { + len_main = coder->longest_match_length; + matches_count = coder->matches_count; + } + + const uint8_t *buf = mf_ptr(mf) - 1; + const uint32_t buf_avail + = ((mf_avail(mf) + 1) < ((2 + ((1 << 3) + (1 << 3) + (1 << 8)) - 1)) + ? (mf_avail(mf) + 1) : ((2 + ((1 << 3) + (1 << 3) + (1 << 8)) - 1))); + + if (buf_avail < 2) { + *back_res = (4294967295U); + *len_res = 1; + return; + } + + uint32_t rep_len = 0; + uint32_t rep_index = 0; + + for (uint32_t i = 0; i < 4; ++i) { + const uint8_t *const buf_back = buf - coder->reps[i] - 1; + if ((read16ne(buf) != read16ne(buf_back))) + continue; + const uint32_t len = lzma_memcmplen(buf, buf_back, 2, buf_avail); + if (len >= nice_len) { + *back_res = i; + *len_res = len; + mf_skip(mf, len - 1); + return; + } + if (len > rep_len) { + rep_index = i; + rep_len = len; + } + } + if (len_main >= nice_len) { + *back_res = coder->matches[matches_count - 1].dist + 4; + *len_res = len_main; + mf_skip(mf, len_main - 1); + return; + } + + uint32_t back_main = 0; + if (len_main >= 2) { + back_main = coder->matches[matches_count - 1].dist; + while (matches_count > 1 && len_main == + coder->matches[matches_count - 2].len + 1) { + if (!(((back_main) >> 7) > (coder->matches[ matches_count - 2].dist))) + break; + --matches_count; + len_main = coder->matches[matches_count - 1].len; + back_main = coder->matches[matches_count - 1].dist; + } + if (len_main == 2 && back_main >= 0x80) + len_main = 1; + } + + if (rep_len >= 2) { + if (rep_len + 1 >= len_main + || (rep_len + 2 >= len_main + && back_main > (1U << 9)) + || (rep_len + 3 >= len_main + && back_main > (1U << 15))) { + *back_res = rep_index; + *len_res = rep_len; + mf_skip(mf, rep_len - 1); + return; + } + } + + if (len_main < 2 || buf_avail <= 2) { + *back_res = (4294967295U); + *len_res = 1; + return; + } + + coder->longest_match_length = lzma_mf_find(mf, + &coder->matches_count, coder->matches); + + if (coder->longest_match_length >= 2) { + const uint32_t new_dist = coder->matches[ + coder->matches_count - 1].dist; + + if ((coder->longest_match_length >= len_main + && new_dist < back_main) + || (coder->longest_match_length == len_main + 1 + && !(((new_dist) >> 7) > (back_main))) + || (coder->longest_match_length > len_main + 1) + || (coder->longest_match_length + 1 >= len_main + && len_main >= 3 + && (((back_main) >> 7) > (new_dist)))) { + *back_res = (4294967295U); + *len_res = 1; + return; + } + } + ++buf; + const uint32_t limit = ((2) > (len_main - 1) ? (2) : (len_main - 1)); + for (uint32_t i = 0; i < 4; ++i) { + if (memcmp(buf, buf - coder->reps[i] - 1, limit) == 0) { + *back_res = (4294967295U); + *len_res = 1; + return; + } + } + + *back_res = back_main + 4; + *len_res = len_main; + mf_skip(mf, len_main - 2); + return; +} + +static inline void +rc_bit(lzma_range_encoder *rc, probability *prob, uint32_t bit) +{ + rc->symbols[rc->count] = bit; + rc->probs[rc->count] = prob; + ++rc->count; +} + +static inline void +rc_bittree(lzma_range_encoder *rc, probability *probs, + uint32_t bit_count, uint32_t symbol) +{ + uint32_t model_index = 1; + + do { + const uint32_t bit = (symbol >> --bit_count) & 1; + rc_bit(rc, &probs[model_index], bit); + model_index = (model_index << 1) + bit; + } while (bit_count != 0); +} + +static _Bool +encode_init(lzma_lzma1_encoder *coder, lzma_mf *mf) +{ + if (mf->read_pos == mf->read_limit) { + if (mf->action == LZMA_RUN) + return 0; + } else { + mf_skip(mf, 1); + mf->read_ahead = 0; + rc_bit(&coder->rc, &coder->is_match[0][0], 0); + rc_bittree(&coder->rc, coder->literal[0], 8, mf->buffer[0]); + } + + coder->is_initialized = 1; + + return 1; +} + +static inline uint32_t +mf_position(const lzma_mf *mf) +{ + return mf->read_pos - mf->read_ahead; +} + +static inline _Bool +rc_shift_low(lzma_range_encoder *rc, + uint8_t *out, size_t *out_pos, size_t out_size) +{ + if ((uint32_t)(rc->low) < (uint32_t)(0xFF000000) + || (uint32_t)(rc->low >> 32) != 0) { + do { + if (*out_pos == out_size) + return 1; + + out[*out_pos] = rc->cache + (uint8_t)(rc->low >> 32); + ++*out_pos; + rc->cache = 0xFF; + } while (--rc->cache_size != 0); + rc->cache = (rc->low >> 24) & 0xFF; + } + + ++rc->cache_size; + rc->low = (rc->low & 0x00FFFFFF) << 8; + return 0; +} + +static inline void +rc_reset(lzma_range_encoder *rc) +{ + rc->low = 0; + rc->cache_size = 1; + rc->range = (4294967295U); + rc->cache = 0; + rc->count = 0; + rc->pos = 0; +} + +static inline _Bool +rc_encode(lzma_range_encoder *rc, + uint8_t *out, size_t *out_pos, size_t out_size) +{ + while (rc->pos < rc->count) { + if (rc->range < (1U << 24)) { + if (rc_shift_low(rc, out, out_pos, out_size)) + return 1; + rc->range <<= 8; + } + + switch (rc->symbols[rc->pos]) { + case RC_BIT_0: { + probability prob = *rc->probs[rc->pos]; + rc->range = (rc->range >> 11) + * prob; + prob += ((1U << 11) - prob) >> 5; + *rc->probs[rc->pos] = prob; + break; + } + + case RC_BIT_1: { + probability prob = *rc->probs[rc->pos]; + const uint32_t bound = prob * (rc->range + >> 11); + rc->low += bound; + rc->range -= bound; + prob -= prob >> 5; + *rc->probs[rc->pos] = prob; + break; + } + + case RC_DIRECT_0: + rc->range >>= 1; + break; + + case RC_DIRECT_1: + rc->range >>= 1; + rc->low += rc->range; + break; + + case RC_FLUSH: + rc->range = (4294967295U); + do { + if (rc_shift_low(rc, out, out_pos, out_size)) + return 1; + } while (++rc->pos < rc->count); + + rc_reset(rc); + return 0; + + default: + break; + } + ++rc->pos; + } + + rc->count = 0; + rc->pos = 0; + return 0; +} + +static inline uint64_t +rc_pending(const lzma_range_encoder *rc) +{ + return rc->cache_size + 5 - 1; +} + +static inline void +literal_matched(lzma_range_encoder *rc, probability *subcoder, + uint32_t match_byte, uint32_t symbol) +{ + uint32_t offset = 0x100; + symbol += 1U << 8; + + do { + match_byte <<= 1; + const uint32_t match_bit = match_byte & offset; + const uint32_t subcoder_index + = offset + match_bit + (symbol >> 8); + const uint32_t bit = (symbol >> 7) & 1; + rc_bit(rc, &subcoder[subcoder_index], bit); + + symbol <<= 1; + offset &= ~(match_byte ^ symbol); + + } while (symbol < (1U << 16)); +} + +static inline void +literal(lzma_lzma1_encoder *coder, lzma_mf *mf, uint32_t position) +{ + const uint8_t cur_byte = mf->buffer[mf->read_pos - mf->read_ahead]; + probability *subcoder = ((coder->literal)[ + (((position) & (coder->literal_pos_mask)) + << (coder->literal_context_bits)) + + ((uint32_t)(mf->buffer[mf->read_pos - mf->read_ahead - 1]) + >> (8U - (coder->literal_context_bits)))]); + + if (((coder->state) < 7)) { + rc_bittree(&coder->rc, subcoder, 8, cur_byte); + } else { + const uint8_t match_byte + = mf->buffer[mf->read_pos - coder->reps[0] - 1 - mf->read_ahead]; + literal_matched(&coder->rc, subcoder, match_byte, cur_byte); + } + coder->state + = ((coder->state) <= STATE_SHORTREP_LIT_LIT + ? STATE_LIT_LIT : ((coder->state) <= STATE_LIT_SHORTREP + ? (coder->state) - 3 : (coder->state) - 6)); +} + +const uint8_t lzma_rc_prices[] = { + 128, 103, 91, 84, 78, 73, 69, 66, + 63, 61, 58, 56, 54, 52, 51, 49, + 48, 46, 45, 44, 43, 42, 41, 40, + 39, 38, 37, 36, 35, 34, 34, 33, + 32, 31, 31, 30, 29, 29, 28, 28, + 27, 26, 26, 25, 25, 24, 24, 23, + 23, 22, 22, 22, 21, 21, 20, 20, + 19, 19, 19, 18, 18, 17, 17, 17, + 16, 16, 16, 15, 15, 15, 14, 14, + 14, 13, 13, 13, 12, 12, 12, 11, + 11, 11, 11, 10, 10, 10, 10, 9, + 9, 9, 9, 8, 8, 8, 8, 7, + 7, 7, 7, 6, 6, 6, 6, 5, + 5, 5, 5, 5, 4, 4, 4, 4, + 3, 3, 3, 3, 3, 2, 2, 2, + 2, 2, 2, 1, 1, 1, 1, 1 +}; + +static inline uint32_t +rc_bit_price(const probability prob, const uint32_t bit) +{ + return lzma_rc_prices[(prob ^ ((0U - bit) + & ((1U << 11) - 1))) >> 4]; +} + +static inline uint32_t +rc_bit_0_price(const probability prob) +{ + return lzma_rc_prices[prob >> 4]; +} + +static inline uint32_t +rc_bit_1_price(const probability prob) +{ + return lzma_rc_prices[(prob ^ ((1U << 11) - 1)) + >> 4]; +} + +static inline uint32_t +rc_bittree_price(const probability *const probs, + const uint32_t bit_levels, uint32_t symbol) +{ + uint32_t price = 0; + symbol += 1U << bit_levels; + + do { + const uint32_t bit = symbol & 1; + symbol >>= 1; + price += rc_bit_price(probs[symbol], bit); + } while (symbol != 1); + + return price; +} + +static void +length_update_prices(lzma_length_encoder *lc, const uint32_t pos_state) +{ + const uint32_t table_size = lc->table_size; + lc->counters[pos_state] = table_size; + + const uint32_t a0 = rc_bit_0_price(lc->choice); + const uint32_t a1 = rc_bit_1_price(lc->choice); + const uint32_t b0 = a1 + rc_bit_0_price(lc->choice2); + const uint32_t b1 = a1 + rc_bit_1_price(lc->choice2); + uint32_t *const prices = lc->prices[pos_state]; + + uint32_t i; + for (i = 0; i < table_size && i < (1 << 3); ++i) + prices[i] = a0 + rc_bittree_price(lc->low[pos_state], + 3, i); + + for (; i < table_size && i < (1 << 3) + (1 << 3); ++i) + prices[i] = b0 + rc_bittree_price(lc->mid[pos_state], + 3, i - (1 << 3)); + + for (; i < table_size; ++i) + prices[i] = b1 + rc_bittree_price(lc->high, 8, + i - (1 << 3) - (1 << 3)); + + return; +} + +static inline void +length(lzma_range_encoder *rc, lzma_length_encoder *lc, + const uint32_t pos_state, uint32_t len, const _Bool fast_mode) +{ + len -= 2; + + if (len < (1 << 3)) { + rc_bit(rc, &lc->choice, 0); + rc_bittree(rc, lc->low[pos_state], 3, len); + } else { + rc_bit(rc, &lc->choice, 1); + len -= (1 << 3); + + if (len < (1 << 3)) { + rc_bit(rc, &lc->choice2, 0); + rc_bittree(rc, lc->mid[pos_state], 3, len); + } else { + rc_bit(rc, &lc->choice2, 1); + len -= (1 << 3); + rc_bittree(rc, lc->high, 8, len); + } + } + + if (!fast_mode) + if (--lc->counters[pos_state] == 0) + length_update_prices(lc, pos_state); +} + +static inline void +rep_match(lzma_lzma1_encoder *coder, const uint32_t pos_state, + const uint32_t rep, const uint32_t len) +{ + if (rep == 0) { + rc_bit(&coder->rc, &coder->is_rep0[coder->state], 0); + rc_bit(&coder->rc, + &coder->is_rep0_long[coder->state][pos_state], + len != 1); + } else { + const uint32_t distance = coder->reps[rep]; + rc_bit(&coder->rc, &coder->is_rep0[coder->state], 1); + + if (rep == 1) { + rc_bit(&coder->rc, &coder->is_rep1[coder->state], 0); + } else { + rc_bit(&coder->rc, &coder->is_rep1[coder->state], 1); + rc_bit(&coder->rc, &coder->is_rep2[coder->state], + rep - 2); + + if (rep == 3) + coder->reps[3] = coder->reps[2]; + + coder->reps[2] = coder->reps[1]; + } + + coder->reps[1] = coder->reps[0]; + coder->reps[0] = distance; + } + + if (len == 1) { + coder->state = ((coder->state) < 7 ? STATE_LIT_SHORTREP : STATE_NONLIT_REP); + } else { + length(&coder->rc, &coder->rep_len_encoder, pos_state, len, + coder->fast_mode); + coder->state = ((coder->state) < 7 ? STATE_LIT_LONGREP : STATE_NONLIT_REP); + } +} + +// This array is constantly initialized in the original code. It's quite big +// so we skip it. +const uint8_t lzma_fastpos[1 << 13]; + +static inline uint32_t +get_dist_slot(uint32_t dist) +{ + if (dist < (1U << (13 + ((0) + (0) * (13 - 1))))) + return lzma_fastpos[dist]; + + if (dist < (1U << (13 + ((0) + (1) * (13 - 1))))) + return (uint32_t)(lzma_fastpos[(dist) >> ((0) + (1) * (13 - 1))]) + 2 * ((0) + (1) * (13 - 1)); + + return (uint32_t)(lzma_fastpos[(dist) >> ((0) + (2) * (13 - 1))]) + 2 * ((0) + (2) * (13 - 1)); +} + +static inline void +rc_bittree_reverse(lzma_range_encoder *rc, probability *probs, + uint32_t bit_count, uint32_t symbol) +{ + uint32_t model_index = 1; + do { + const uint32_t bit = symbol & 1; + symbol >>= 1; + rc_bit(rc, &probs[model_index], bit); + model_index = (model_index << 1) + bit; + } while (--bit_count != 0); +} + +static inline void +rc_direct(lzma_range_encoder *rc, uint32_t value, uint32_t bit_count) +{ + do { + rc->symbols[rc->count++] + = RC_DIRECT_0 + ((value >> --bit_count) & 1); + } while (bit_count != 0); +} + +static inline void +match(lzma_lzma1_encoder *coder, const uint32_t pos_state, + const uint32_t distance, const uint32_t len) +{ + coder->state = ((coder->state) < 7 ? STATE_LIT_MATCH : STATE_NONLIT_MATCH); + + length(&coder->rc, &coder->match_len_encoder, pos_state, len, + coder->fast_mode); + + const uint32_t dist_slot = get_dist_slot(distance); + const uint32_t dist_state = ((len) < 4 + 2 ? (len) - 2 : 4 - 1); + rc_bittree(&coder->rc, coder->dist_slot[dist_state], 6, dist_slot); + + if (dist_slot >= 4) { + const uint32_t footer_bits = (dist_slot >> 1) - 1; + const uint32_t base = (2 | (dist_slot & 1)) << footer_bits; + const uint32_t dist_reduced = distance - base; + + if (dist_slot < 14) { + rc_bittree_reverse(&coder->rc, coder->dist_special + base - dist_slot - 1, + footer_bits, dist_reduced); + } else { + rc_direct(&coder->rc, dist_reduced >> 4, + footer_bits - 4); + rc_bittree_reverse( + &coder->rc, coder->dist_align, + 4, dist_reduced & ((1 << 4) - 1)); + ++coder->align_price_count; + } + } + + coder->reps[3] = coder->reps[2]; + coder->reps[2] = coder->reps[1]; + coder->reps[1] = coder->reps[0]; + coder->reps[0] = distance; + ++coder->match_price_count; +} + +static void +encode_symbol(lzma_lzma1_encoder *coder, lzma_mf *mf, + uint32_t back, uint32_t len, uint32_t position) +{ + const uint32_t pos_state = position & coder->pos_mask; + + if (back == (4294967295U)) { + rc_bit(&coder->rc, + &coder->is_match[coder->state][pos_state], 0); + literal(coder, mf, position); + } else { + rc_bit(&coder->rc, + &coder->is_match[coder->state][pos_state], 1); + + if (back < 4) { + rc_bit(&coder->rc, &coder->is_rep[coder->state], 1); + rep_match(coder, pos_state, back, len); + } else { + rc_bit(&coder->rc, &coder->is_rep[coder->state], 0); + match(coder, pos_state, back - 4, len); + } + } + mf->read_ahead -= len; +} + +static void +encode_eopm(lzma_lzma1_encoder *coder, uint32_t position) +{ + const uint32_t pos_state = position & coder->pos_mask; + rc_bit(&coder->rc, &coder->is_match[coder->state][pos_state], 1); + rc_bit(&coder->rc, &coder->is_rep[coder->state], 0); + match(coder, pos_state, (4294967295U), 2); +} + +static inline void +rc_flush(lzma_range_encoder *rc) +{ + for (size_t i = 0; i < 5; ++i) + rc->symbols[rc->count++] = RC_FLUSH; +} + +extern void exit (int __status) + __attribute__ ((__nothrow__ , __leaf__ , __noreturn__)); + +extern lzma_ret +lzma_lzma_encode(lzma_lzma1_encoder *restrict coder, lzma_mf *restrict mf, + uint8_t *restrict out, size_t *restrict out_pos, + size_t out_size, uint32_t limit) +{ + + if (!coder->is_initialized && !encode_init(coder, mf)) + return LZMA_OK; + + uint32_t position = mf_position(mf); + + while (1) { + if (rc_encode(&coder->rc, out, out_pos, out_size)) { + return LZMA_OK; + } + + if (limit != (4294967295U) + && (mf->read_pos - mf->read_ahead >= limit + || *out_pos + rc_pending(&coder->rc) + >= (1U << 16) - ((1 << 12) + 1))) + break; + + if (mf->read_pos >= mf->read_limit) { + if (mf->action == LZMA_RUN) + return LZMA_OK; + + + if (mf->read_ahead == 0) + break; + } + uint32_t len; + uint32_t back; + + if (coder->fast_mode) + lzma_lzma_optimum_fast(coder, mf, &back, &len); + else + // The original code contains the call to + // lzma_lzma_optimum_normal(coder, mf, &back, &len, position); + exit (-1); + + encode_symbol(coder, mf, back, len, position); + + position += len; + } + + if (!coder->is_flushed) { + coder->is_flushed = 1; + if (limit == (4294967295U)) + encode_eopm(coder, position); + + rc_flush(&coder->rc); + + if (rc_encode(&coder->rc, out, out_pos, out_size)) { + return LZMA_OK; + } + } + + coder->is_flushed = 0; + return LZMA_STREAM_END; +} + +extern void +lzma_free(void *ptr, const lzma_allocator *allocator) +{ + if (allocator != ((void *)0) && allocator->free != ((void *)0)) + allocator->free(allocator->opaque, ptr); + else + free(ptr); + return; +} + +static _Bool +lz_encoder_prepare(lzma_mf *mf, const lzma_allocator *allocator, + const lzma_lz_options *lz_options) +{ + if (lz_options->dict_size < 4096U + || lz_options->dict_size + > (1U << 30) + (1U << 29) + || lz_options->nice_len > lz_options->match_len_max) + return 1; + + mf->keep_size_before = lz_options->before_size + lz_options->dict_size; + mf->keep_size_after = lz_options->after_size + + lz_options->match_len_max; + uint32_t reserve = lz_options->dict_size / 2; + if (reserve > (1U << 30)) + reserve /= 2; + + reserve += (lz_options->before_size + lz_options->match_len_max + + lz_options->after_size) / 2 + (1U << 19); + + const uint32_t old_size = mf->size; + mf->size = mf->keep_size_before + reserve + mf->keep_size_after; + + if ((mf->buffer != ((void *)0)) && old_size != mf->size) { + lzma_free(mf->buffer, allocator); + mf->buffer = ((void *)0); + } + + mf->match_len_max = lz_options->match_len_max; + mf->nice_len = lz_options->nice_len; + mf->cyclic_size = lz_options->dict_size + 1; + + switch (lz_options->match_finder) { + case LZMA_MF_HC3: + mf->find = &lzma_mf_hc3_find; + mf->skip = &lzma_mf_hc3_skip; + break; + + case LZMA_MF_HC4: + mf->find = &lzma_mf_hc4_find; + mf->skip = &lzma_mf_hc4_skip; + break; + + case LZMA_MF_BT2: + mf->find = &lzma_mf_bt2_find; + mf->skip = &lzma_mf_bt2_skip; + break; + + case LZMA_MF_BT3: + mf->find = &lzma_mf_bt3_find; + mf->skip = &lzma_mf_bt3_skip; + break; + + case LZMA_MF_BT4: + mf->find = &lzma_mf_bt4_find; + mf->skip = &lzma_mf_bt4_skip; + break; + + default: + return 1; + } + + const uint32_t hash_bytes = lz_options->match_finder & 0x0F; + if (hash_bytes > mf->nice_len) + return 1; + + const _Bool is_bt = (lz_options->match_finder & 0x10) != 0; + uint32_t hs; + + if (hash_bytes == 2) { + hs = 0xFFFF; + } else { + hs = lz_options->dict_size - 1; + hs |= hs >> 1; + hs |= hs >> 2; + hs |= hs >> 4; + hs |= hs >> 8; + hs >>= 1; + hs |= 0xFFFF; + + if (hs > (1U << 24)) { + if (hash_bytes == 3) + hs = (1U << 24) - 1; + else + hs >>= 1; + } + } + + mf->hash_mask = hs; + + ++hs; + if (hash_bytes > 2) + hs += (1U << 10); + if (hash_bytes > 3) + hs += (1U << 16); + + const uint32_t old_hash_count = mf->hash_count; + const uint32_t old_sons_count = mf->sons_count; + mf->hash_count = hs; + mf->sons_count = mf->cyclic_size; + if (is_bt) + mf->sons_count *= 2; + + if (old_hash_count != mf->hash_count + || old_sons_count != mf->sons_count) { + lzma_free(mf->hash, allocator); + mf->hash = ((void *)0); + + lzma_free(mf->son, allocator); + mf->son = ((void *)0); + } + + mf->depth = lz_options->depth; + if (mf->depth == 0) { + if (is_bt) + mf->depth = 16 + mf->nice_len / 2; + else + mf->depth = 4 + mf->nice_len / 4; + } + + return 0; +} + +int +main () +{ + lzma_mf mf; + lzma_allocator allocator; + lzma_lz_options lz_options; + + void *coder; + uint8_t *restrict out; + size_t *restrict out_pos; + size_t out_size; + + lz_encoder_prepare(&mf, &allocator, &lz_options); + return (int) lzma_lzma_encode(coder, &mf, out, out_pos, out_size, (4294967295U)); +} + + +/* { dg-final { scan-wpa-ipa-dump "Save results of indirect call analysis." "icp"} } */ +/* { dg-final { scan-wpa-ipa-dump-times "For call" 2 "icp"} } */ +/* { dg-final { scan-wpa-ipa-dump-times "Insert 0 prefetch stmt:" 5 "ipa_prefetch"} } */ +/* { dg-final { scan-wpa-ipa-dump-times "Insert 1 prefetch stmt:" 4 "ipa_prefetch"} } */ +/* { dg-final { scan-wpa-ipa-dump-times "Insert 2 prefetch stmt:" 2 "ipa_prefetch"} } */ -- 2.33.0
Locations
Projects
Search
Status Monitor
Help
Open Build Service
OBS Manuals
API Documentation
OBS Portal
Reporting a Bug
Contact
Mailing List
Forums
Chat (IRC)
Twitter
Open Build Service (OBS)
is an
openSUSE project
.
浙ICP备2022010568号-2