diff --git a/gcc/ai-optimizer.cc b/gcc/ai-optimizer.cc
index 8908d1be326f9f8943862d916e82ba384adaf2f0..e16ad59c889fe2db0c6da063b8d2b764fda20639 100644
--- a/gcc/ai-optimizer.cc
+++ b/gcc/ai-optimizer.cc
@@ -285,14 +285,15 @@ static int
 graph_infer (int argc1, const char **argv1, const char *mops,
              int argc2, int64_t *argv2)
 {
-  char gcc_exec_prefix[512];
+  const int prefix_buff_len = 512;
+  char gcc_exec_prefix[prefix_buff_len] = {0};
   ssize_t len = readlink ("/proc/self/exe", gcc_exec_prefix,
                           sizeof (gcc_exec_prefix) - 1);
   if (len == -1)
     return 0;
 
-  char native_file[512];
-  strncpy (native_file, gcc_exec_prefix, sizeof (native_file) - 1);
+  char native_file[prefix_buff_len] = {0};
+  strncpy (native_file, gcc_exec_prefix, len);
   const char *target = "bin/gcc";
   const char *target_cc1 = "cc1";
   const char *target_gpp = "bin/g++";
@@ -330,6 +331,8 @@ graph_infer (int argc1, const char **argv1, const char *mops,
                strlen (native_file) - 1);
         }
     }
+  else
+    return 0;
 
   if (access (native_file, F_OK) == 0)
     fill_node (native_file);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index fb5e355d0a855d6d28101b169cf2a08ca73b6caf..bcf919ed0cf26e79216a75aad2d464ef5d29f506 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -6533,6 +6533,22 @@
   [(set_attr "type" "neon_compare, neon_compare_zero")]
 )
 
+;; Use cmlt to replace vector arithmetic operations like this (SImode example):
+;; B = ((A >> 15) & 0x00010001) * 0x0000ffff
+(define_insn "*aarch64_cmlt_as_arith2"
+  [(set (match_operand:<MODE> 0 "register_operand" "=w")
+        (mult:<MODE>
+          (and:<MODE>
+            (lshiftrt:<MODE>
+              (match_operand:VDQHSD 1 "register_operand" "w")
+              (match_operand:VDQHSD 2 "half_size_minus_one_operand"))
+            (match_operand:VDQHSD 3 "cmlt_arith_mask_operand"))
+          (match_operand:VDQHSD 4 "half_bit_all_one_operand")))]
+  "TARGET_SIMD && flag_cmlt_arith"
+  "cmlt\t%0., %1., #0"
+  [(set_attr "type" "neon_compare_zero")]
+)
+
 ;; Use cmlt to replace vector arithmetic operations like this (SImode example):
 ;; B = (((A >> 15) & 0x00010001) << 16) - ((A >> 15) & 0x00010001)
 ;; TODO: maybe extend to scalar operations or other cm** instructions.
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index faa445e26219d96173c34f60dc28ef4be1ebc583..52cac0b82d79393c30c1281a935ff5db5947a831 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -2326,7 +2326,7 @@ static const struct tune_params hip12_tunings =
   2,    /* min_div_recip_mul_df.  */
   0,    /* max_case_values.  */
   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_PREFER_ADVSIMD_AUTOVEC),  /* tune_flags.  */
   &hip12_prefetch_tune
 };
 
@@ -17310,6 +17310,18 @@ cost_plus:
       return true;
 
     case MULT:
+      op0 = XEXP (x, 0);
+      op1 = XEXP (x, 1);
+      if (flag_cmlt_arith && GET_CODE (op0) == AND)
+        {
+          rtx op0_subop0 = XEXP (op0, 0);
+          if (GET_CODE (op0_subop0) == LSHIFTRT)
+            {
+              *cost += rtx_cost (op0, mode, MULT, 0, speed);
+              *cost += rtx_cost (op1, mode, MULT, 0, speed);
+              return true;
+            }
+        }
       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
       /* aarch64_rtx_mult_cost always handles recursion to its
          operands.  */
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 3ec9e9103f5e51d5a9ba323ab679ada7b3177ee1..28132873441ee67db369bdfaee32d6bed8fb46de 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -74,6 +74,15 @@
   return CONST_INT_P (op) && (UINTVAL (op) == mask);
 })
 
+(define_predicate "half_bit_all_one_operand"
+  (match_code "const_vector")
+{
+  op = unwrap_const_vec_duplicate (op);
+  unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+  unsigned long long mask = ((unsigned long long) 1 << size) - 1;
+  return CONST_INT_P (op) && (UINTVAL (op) == mask);
+})
+
 (define_predicate "subreg_lowpart_operator"
   (ior (match_code "truncate")
        (and (match_code "subreg")
diff --git a/gcc/testsuite/gcc.dg/combine-cmlt-2.c b/gcc/testsuite/gcc.dg/combine-cmlt-2.c
new file mode 100755
index 0000000000000000000000000000000000000000..bb6a92b2d8c213baa6a256e3a53924ed8a3c4db8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-cmlt-2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -mcmlt-arith -mcpu=hip12" } */
+
+/* The test checks usage of cmlt insns for arithmetic/logic calculations
+ * in foo ().  It's inspired by sources of x264 codec.  */
+
+typedef unsigned short int uint16_t;
+typedef unsigned int uint32_t;
+
+void foo( uint32_t *a, uint32_t *b)
+{
+  for (unsigned i = 0; i < 4; i++)
+  {
+    uint32_t s = ((a[i]>>((8 * sizeof(uint16_t))-1))
+                 &(((uint32_t)1<<(8 * sizeof(uint16_t)))+1))*((uint16_t)-1);
+    b[i] = (a[i]+s)^s;
+  }
+}
+
+/* { dg-final { scan-assembler-times {cmlt\t} 1 } } */
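
Background for the gcc/ai-optimizer.cc hunk: readlink () does not NUL-terminate the buffer it fills, so the old code could copy an unterminated string from gcc_exec_prefix into native_file. The standalone C sketch below is illustrative only (it is not code from the patch) and shows the corrected idiom: zero-initialise both buffers and copy exactly the len bytes that readlink reported.

#include <stdio.h>
#include <string.h>
#include <unistd.h>

enum { prefix_buff_len = 512 };

int
main (void)
{
  /* readlink () writes at most prefix_buff_len - 1 bytes and never adds
     a '\0'; the = {0} initialisers are what guarantee termination.  */
  char gcc_exec_prefix[prefix_buff_len] = {0};
  char native_file[prefix_buff_len] = {0};

  ssize_t len = readlink ("/proc/self/exe", gcc_exec_prefix,
                          sizeof (gcc_exec_prefix) - 1);
  if (len == -1)
    return 0;

  /* Copy only the bytes readlink actually produced; the trailing zeros
     from the initialiser keep native_file NUL-terminated as well.  */
  strncpy (native_file, gcc_exec_prefix, len);
  printf ("%s\n", native_file);
  return 0;
}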
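
Background for the *aarch64_cmlt_as_arith2 pattern and the combine-cmlt-2.c test: within one vector element, ((A >> 15) & 0x00010001) * 0x0000ffff yields an all-ones 16-bit half exactly where that half of A has its sign bit set, which is the result cmlt #0 produces on the half-width lanes, so the whole shift/and/multiply chain collapses into a single compare. The C program below is a minimal standalone check of that identity for 32-bit elements; the helper names mask_by_mult and mask_by_cmlt are illustrative and do not appear in the patch.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* The shift/and/multiply form matched by the insn pattern (one SImode
   element).  */
static uint32_t
mask_by_mult (uint32_t a)
{
  return ((a >> 15) & 0x00010001u) * 0x0000ffffu;
}

/* What cmlt #0 on the two 16-bit halves of the same element computes:
   all ones in each half whose sign bit is set.  */
static uint32_t
mask_by_cmlt (uint32_t a)
{
  uint32_t lo = (a & 0x00008000u) ? 0x0000ffffu : 0u;
  uint32_t hi = (a & 0x80000000u) ? 0xffff0000u : 0u;
  return hi | lo;
}

int
main (void)
{
  const uint32_t samples[] = { 0x00000000u, 0x00008000u, 0x80000000u,
                               0x80008000u, 0x7fff7fffu, 0xdeadbeefu };
  for (unsigned i = 0; i < sizeof samples / sizeof samples[0]; i++)
    assert (mask_by_mult (samples[i]) == mask_by_cmlt (samples[i]));
  printf ("identity holds\n");
  return 0;
}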