diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Embedding/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/Embedding/README.md
index afe506b230f..1edab9433b8 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/Embedding/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Embedding/README.md
@@ -44,7 +44,7 @@ Arguments info:
 - `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (i.e. `maidalun1020/bce-embedding-base_v1`) to be downloaded, or the path to the huggingface checkpoint folder.
 - `--prompt PROMPT`: argument defining the sentences to encode.
 - `--max-context-len MAX_CONTEXT_LEN`: argument defining the maximum sequence length for both input and output tokens. It is default to be `1024`.
-- `--max-prompt-len MAX_PROMPT_LEN`: argument defining the maximum number of tokens that the input prompt can contain. It is default to be `960`.
+- `--max-prompt-len MAX_PROMPT_LEN`: argument defining the maximum number of tokens that the input prompt can contain. It is default to be `512`.
 - `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, otherwise the lowbit model in `SAVE_DIRECTORY` will be loaded.
 
 #### Sample Output
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/README.md
index fb30271a223..e9482793b81 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/README.md
@@ -78,7 +78,7 @@ Arguments info:
 - `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (e.g.`Meta-llama/Llama-2-7b-chat-hf` for Llama2-7B) to be downloaded, or the path to the huggingface checkpoint folder.
 - `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted model will be saved into `SAVE_DIRECTORY`.
 - `--max-context-len MAX_CONTEXT_LEN`: argument defining the maximum sequence length for both input and output tokens. It is default to be `1024`.
-- `--max-prompt-len MAX_PROMPT_LEN`: argument defining the maximum number of tokens that the input prompt can contain. It is default to be `960`.
+- `--max-prompt-len MAX_PROMPT_LEN`: argument defining the maximum number of tokens that the input prompt can contain. It is default to be `512`.
 - `--low-bit` LOW_BIT: argument defining the low bit optimizations that will be applied to the model. Current available options are `"sym_int4"`, `"asym_int4"` and `"sym_int8"`, with `"sym_int4"` as the default.
 
 ## 3. Build C++ Example `llm-npu-cli`
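Both README hunks above document the same pair of length arguments, so it is worth being explicit about how they interact. The sketch below is not code from this PR; the final assertion encodes the assumed relationship between the prompt budget, the generation budget, and the context window.

```python
import argparse

# Minimal sketch (not code from this PR) of the two length arguments that
# every touched example shares. `--max-context-len` bounds input plus output
# tokens together, while `--max-prompt-len` bounds the input prompt alone.
parser = argparse.ArgumentParser()
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=512)  # was 960
parser.add_argument("--n-predict", type=int, default=32)
args = parser.parse_args([])

# Assumed relationship, not enforced by this diff: the prompt budget plus
# the generation budget should fit inside the context window.
assert args.max_prompt_len + args.n_predict <= args.max_context_len
```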
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py
index 103e82f3318..7a22d567958 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py
@@ -47,7 +47,7 @@
                              "Else, program will raise error.",
     )
     parser.add_argument("--max-context-len", type=int, default=1024)
-    parser.add_argument("--max-prompt-len", type=int, default=960)
+    parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument("--quantization-group-size", type=int, default=0)
     parser.add_argument('--low-bit', type=str, default="sym_int4",
                         help='Low bit optimizations that will be applied to the model.')
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
index 849f8cc153b..14c81a21a9d 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/README.md
@@ -104,7 +104,7 @@ Arguments info:
 - `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `"What is AI?"` or `"AI是什么?"`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
 - `--max-context-len MAX_CONTEXT_LEN`: argument defining the maximum sequence length for both input and output tokens. It is default to be `1024`.
-- `--max-prompt-len MAX_PROMPT_LEN`: argument defining the maximum number of tokens that the input prompt can contain. It is default to be `960`.
+- `--max-prompt-len MAX_PROMPT_LEN`: argument defining the maximum number of tokens that the input prompt can contain. It is default to be `512`.
 - `--low-bit` LOW_BIT: argument defining the low bit optimizations that will be applied to the model. Current available options are `"sym_int4"`, `"asym_int4"` and `"sym_int8"`, with `"sym_int4"` as the default.
 - `--disable-streaming`: argument defining whether to disable the streaming mode for generation.
 - `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, otherwise the lowbit model in `SAVE_DIRECTORY` will be loaded.
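For `convert.py`-style scripts, the parsed values feed straight into model conversion. A hedged sketch of that flow is below; the `ipex_llm.transformers.npu_model` import path and the keyword names mirror the NPU examples this diff touches and should be treated as assumptions if your ipex-llm version differs. `args` is an argparse namespace like the one sketched earlier.

```python
# Illustrative sketch of how the parsed defaults reach model conversion.
# Keyword names are assumptions mirroring the examples this diff edits.
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    args.repo_id_or_model_path,
    load_in_low_bit=args.low_bit,           # "sym_int4" by default
    max_context_len=args.max_context_len,   # 1024 by default
    max_prompt_len=args.max_prompt_len,     # now 512 by default (was 960)
    trust_remote_code=True,
)
```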
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py
index 018ad27d18b..a355e6b476e 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/baichuan2.py
@@ -41,7 +41,7 @@
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict.")
     parser.add_argument("--max-context-len", type=int, default=1024)
-    parser.add_argument("--max-prompt-len", type=int, default=960)
+    parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument('--low-bit', type=str, default="sym_int4",
                         help='Low bit optimizations that will be applied to the model.')
     parser.add_argument("--disable-streaming", action="store_true", default=False)
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/glm.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/glm.py
index 292e4b4d572..919e5fff3b5 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/glm.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/glm.py
@@ -41,7 +41,7 @@
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict.")
     parser.add_argument("--max-context-len", type=int, default=1024)
-    parser.add_argument("--max-prompt-len", type=int, default=960)
+    parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument('--low-bit', type=str, default="sym_int4",
                         help='Low bit optimizations that will be applied to the model.')
     parser.add_argument("--disable-streaming", action="store_true", default=False)
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py
index 21e1db5f6c6..c3af421eb14 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama2.py
@@ -54,7 +54,7 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict.")
     parser.add_argument("--max-context-len", type=int, default=1024)
-    parser.add_argument("--max-prompt-len", type=int, default=960)
+    parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument('--low-bit', type=str, default="sym_int4",
                         help='Low bit optimizations that will be applied to the model.')
     parser.add_argument("--disable-streaming", action="store_true", default=False)
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama3.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama3.py
index ed54d1ce993..e0906352464 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama3.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/llama3.py
@@ -55,7 +55,7 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict.")
     parser.add_argument("--max-context-len", type=int, default=1024)
-    parser.add_argument("--max-prompt-len", type=int, default=960)
+    parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument('--low-bit', type=str, default="sym_int4",
                         help='Low bit optimizations that will be applied to the model.')
     parser.add_argument("--disable-streaming", action="store_true", default=False)
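Because the default prompt budget drops from 960 to 512 tokens, prompts that used to fit may now exceed the limit in all four chat scripts above. A hedged guard using the standard `transformers` tokenizer API is sketched below; the truncation policy itself is an illustrative assumption, not behavior these scripts implement, and `args` is again the argparse namespace from earlier.

```python
from transformers import AutoTokenizer

# Count the prompt's tokens before handing it to the NPU pipeline, so an
# over-long prompt is caught (and here, truncated) up front rather than
# failing later. Truncation is an assumed policy, for illustration only.
tokenizer = AutoTokenizer.from_pretrained(args.repo_id_or_model_path,
                                          trust_remote_code=True)
input_ids = tokenizer.encode(args.prompt, return_tensors="pt")

if input_ids.shape[1] > args.max_prompt_len:
    print(f"Prompt is {input_ids.shape[1]} tokens, limit is "
          f"{args.max_prompt_len}; truncating.")
    input_ids = input_ids[:, :args.max_prompt_len]
```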
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py
index 90e99db21d5..e91c547bdb8 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/minicpm.py
@@ -41,7 +41,7 @@
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict.")
     parser.add_argument("--max-context-len", type=int, default=1024)
-    parser.add_argument("--max-prompt-len", type=int, default=960)
+    parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument('--low-bit', type=str, default="sym_int4",
                         help='Low bit optimizations that will be applied to the model.')
     parser.add_argument("--disable-streaming", action="store_true", default=False)
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py
index 2afc5c20508..585256c72fd 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py
@@ -41,7 +41,7 @@
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict.")
     parser.add_argument("--max-context-len", type=int, default=1024)
-    parser.add_argument("--max-prompt-len", type=int, default=960)
+    parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument("--quantization-group-size", type=int, default=0)
     parser.add_argument('--low-bit', type=str, default="sym_int4",
                         help='Low bit optimizations that will be applied to the model.')
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md
index caf21cf891d..cc5b668a317 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/README.md
@@ -58,7 +58,7 @@ Arguments info:
 - `--prompt PROMPT`: argument defining the prompt to be infered (with integrated prompt format for chat). It is default to be `"What is in this image?"`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
 - `--max-context-len MAX_CONTEXT_LEN`: argument defining the maximum sequence length for both input and output tokens. It is default to be `1024`.
-- `--max-prompt-len MAX_PROMPT_LEN`: argument defining the maximum number of tokens that the input prompt can contain. It is default to be `960`.
+- `--max-prompt-len MAX_PROMPT_LEN`: argument defining the maximum number of tokens that the input prompt can contain. It is default to be `512`.
 - `--low-bit` LOW_BIT: argument defining the low bit optimizations that will be applied to the model. Current available options are `"sym_int4"`, `"asym_int4"` and `"sym_int8"`, with `"sym_int4"` as the default.
 - `--save-directory SAVE_DIRECTORY`: argument defining the path to save converted model. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, otherwise the lowbit model in `SAVE_DIRECTORY` will be loaded.
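Across all of the scripts above, the new value is only a default. Callers that depended on the old 960-token budget can restore it explicitly, as this self-contained argparse sketch (hypothetical values) shows:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=512)

# The 512 value above is only a fallback: passing the flag on the command
# line restores the previous 960-token budget for prompts that need it.
args = parser.parse_args(["--max-prompt-len", "960"])
assert args.max_prompt_len == 960
```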
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py
index 03beaf7e3e7..22ddb786fa4 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm-llama3-v2.5.py
@@ -46,7 +46,7 @@
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict.")
     parser.add_argument("--max-context-len", type=int, default=1024)
-    parser.add_argument("--max-prompt-len", type=int, default=960)
+    parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument('--low-bit', type=str, default="sym_int4",
                         help='Low bit optimizations that will be applied to the model.')
     parser.add_argument("--save-directory", type=str,
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py
index a03c13cb657..195d82b9fa7 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Multimodal/minicpm_v_2_6.py
@@ -37,7 +37,7 @@
                         help='Prompt to infer')
     parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict.")
     parser.add_argument("--max-context-len", type=int, default=1024)
-    parser.add_argument("--max-prompt-len", type=int, default=960)
+    parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument('--low-bit', type=str, default="sym_int4",
                         help='Low bit optimizations that will be applied to the model.')
     parser.add_argument("--save-directory", type=str,
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Save-Load/README.md b/python/llm/example/NPU/HF-Transformers-AutoModels/Save-Load/README.md
index dcde4ac5806..511af9e07c5 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/Save-Load/README.md
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Save-Load/README.md
@@ -46,7 +46,7 @@ In the example, several arguments can be passed to satisfy your requirements:
 - `--prompt PROMPT`: argument defining the prompt to be inferred (with integrated prompt format for chat). It is default to be `'What is AI?'`.
 - `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It is default to be `32`.
 - `--max-context-len MAX_CONTEXT_LEN`: argument defining the maximum sequence length for both input and output tokens. It is default to be `1024`.
-- `--max-prompt-len MAX_PROMPT_LEN`: argument defining the maximum number of tokens that the input prompt can contain. It is default to be `960`.
+- `--max-prompt-len MAX_PROMPT_LEN`: argument defining the maximum number of tokens that the input prompt can contain. It is default to be `512`.
 - `--low-bit` LOW_BIT: argument defining the low bit optimizations that will be applied to the model. Current available options are `"sym_int4"`, `"asym_int4"` and `"sym_int8"`, with `"sym_int4"` as the default.
 
 ### Sample Output
diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/Save-Load/generate.py b/python/llm/example/NPU/HF-Transformers-AutoModels/Save-Load/generate.py
index 44a924a706a..5cbeed612e0 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/Save-Load/generate.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/Save-Load/generate.py
@@ -44,7 +44,7 @@
     parser.add_argument('--n-predict', type=int, default=32,
                         help='Max tokens to predict')
     parser.add_argument("--max-context-len", type=int, default=1024)
-    parser.add_argument("--max-prompt-len", type=int, default=960)
+    parser.add_argument("--max-prompt-len", type=int, default=512)
     parser.add_argument('--low-bit', type=str, default="sym_int4",
                         help='Low bit optimizations that will be applied to the model.')
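Finally, the Save-Load README above describes a two-path load: convert and save on the first run, then load the low-bit copy on later runs. A hedged sketch of that dispatch is below; `load_low_bit` is the usual ipex-llm counterpart to `from_pretrained`, but verify the exact signature against your ipex-llm version, and the save-on-convert behavior of `save_directory` is an assumption taken from the README text rather than from this diff.

```python
import os

from ipex_llm.transformers.npu_model import AutoModelForCausalLM

# Assumed dispatch matching the Save-Load README: if the save directory
# already holds a converted model, load the low-bit copy directly;
# otherwise convert the original checkpoint and save it there.
if os.path.isdir(args.save_directory):
    model = AutoModelForCausalLM.load_low_bit(args.save_directory)
else:
    model = AutoModelForCausalLM.from_pretrained(
        args.repo_id_or_model_path,
        load_in_low_bit=args.low_bit,
        max_context_len=args.max_context_len,
        max_prompt_len=args.max_prompt_len,   # 512 by default after this PR
        save_directory=args.save_directory,   # assumption: triggers saving
    )
```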