From e142e97b4e522fc3bc0e78de9c5a77877150252b Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 9 May 2025 19:16:20 +0200 Subject: [PATCH 1/5] llama-bench : accept ranges for integer parameters --- tools/llama-bench/llama-bench.cpp | 657 ++++++++++++++++-------------- 1 file changed, 345 insertions(+), 312 deletions(-) diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index 5d26b506bd935..ba5d9725d039b 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -195,6 +195,30 @@ static std::string pair_str(const std::pair & p) { return buf; } +static std::vector parse_int_range(const std::string & s) { + // start[-end[+step]][,start[-end[+step]]...] + std::regex range_regex(R"(^(\d+)(?:-(\d+)(?:\+(\d+))?)?(,|$))"); + std::smatch match; + std::string::const_iterator search_start(s.cbegin()); + std::vector result; + while (std::regex_search(search_start, s.cend(), match, range_regex)) { + int start = std::stoi(match[1]); + int end = match[2].matched ? std::stoi(match[2]) : start; + int step = match[3].matched ? std::stoi(match[3]) : 1; + + for (int i = start; i <= end; i += step) { + result.push_back(i); + } + search_start = match.suffix().first; + } + + if (search_start != s.cend()) { + throw std::invalid_argument("invalid range format"); + } + + return result; +} + struct cmd_params { std::vector model; std::vector n_prompt; @@ -326,7 +350,7 @@ static void print_usage(int /* argc */, char ** argv) { printf("\n"); printf( "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter " - "multiple times.\n"); + "multiple times. Ranges can be specified with 'start-end' or 'start-end+step'.\n"); } static ggml_type ggml_type_from_name(const std::string & s) { @@ -380,186 +404,190 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { std::replace(arg.begin(), arg.end(), '_', '-'); } - if (arg == "-h" || arg == "--help") { - print_usage(argc, argv); - exit(0); - } else if (arg == "-m" || arg == "--model") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.model.insert(params.model.end(), p.begin(), p.end()); - } else if (arg == "-p" || arg == "--n-prompt") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end()); - } else if (arg == "-n" || arg == "--n-gen") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.n_gen.insert(params.n_gen.end(), p.begin(), p.end()); - } else if (arg == "-pg") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], ','); - if (p.size() != 2) { - invalid_param = true; - break; - } - params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) }); - } else if (arg == "-d" || arg == "--n-depth") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.n_depth.insert(params.n_depth.end(), p.begin(), p.end()); - } else if (arg == "-b" || arg == "--batch-size") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.n_batch.insert(params.n_batch.end(), p.begin(), p.end()); - } else if (arg == "-ub" || arg == "--ubatch-size") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end()); - } else if (arg == "-ctk" || arg == "--cache-type-k") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - std::vector types; - for (const auto & t : p) { - ggml_type gt = ggml_type_from_name(t); - if (gt == GGML_TYPE_COUNT) { + try { + if (arg == "-h" || arg == "--help") { + print_usage(argc, argv); + exit(0); + } else if (arg == "-m" || arg == "--model") { + if (++i >= argc) { invalid_param = true; break; } - types.push_back(gt); - } - if (invalid_param) { - break; - } - params.type_k.insert(params.type_k.end(), types.begin(), types.end()); - } else if (arg == "-ctv" || arg == "--cache-type-v") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - std::vector types; - for (const auto & t : p) { - ggml_type gt = ggml_type_from_name(t); - if (gt == GGML_TYPE_COUNT) { + auto p = string_split(argv[i], split_delim); + params.model.insert(params.model.end(), p.begin(), p.end()); + } else if (arg == "-p" || arg == "--n-prompt") { + if (++i >= argc) { invalid_param = true; break; } - types.push_back(gt); - } - if (invalid_param) { - break; - } - params.type_v.insert(params.type_v.end(), types.begin(), types.end()); - } else if (arg == "-t" || arg == "--threads") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.n_threads.insert(params.n_threads.end(), p.begin(), p.end()); - } else if (arg == "-C" || arg == "--cpu-mask") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end()); - } else if (arg == "--cpu-strict") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end()); - } else if (arg == "--poll") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.poll.insert(params.poll.end(), p.begin(), p.end()); - } else if (arg == "-ngl" || arg == "--n-gpu-layers") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); - } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) { - if (++i >= argc) { - invalid_param = true; - break; - } - params.rpc_servers.push_back(argv[i]); - } else if (arg == "-sm" || arg == "--split-mode") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - std::vector modes; - for (const auto & m : p) { - llama_split_mode mode; - if (m == "none") { - mode = LLAMA_SPLIT_MODE_NONE; - } else if (m == "layer") { - mode = LLAMA_SPLIT_MODE_LAYER; - } else if (m == "row") { - mode = LLAMA_SPLIT_MODE_ROW; - } else { + auto p = parse_int_range(argv[i]); + params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end()); + } else if (arg == "-n" || arg == "--n-gen") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_gen.insert(params.n_gen.end(), p.begin(), p.end()); + } else if (arg == "-pg") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], ','); + if (p.size() != 2) { + invalid_param = true; + break; + } + params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) }); + } else if (arg == "-d" || arg == "--n-depth") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_depth.insert(params.n_depth.end(), p.begin(), p.end()); + } else if (arg == "-b" || arg == "--batch-size") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_batch.insert(params.n_batch.end(), p.begin(), p.end()); + } else if (arg == "-ub" || arg == "--ubatch-size") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end()); + } else if (arg == "-ctk" || arg == "--cache-type-k") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + + std::vector types; + for (const auto & t : p) { + ggml_type gt = ggml_type_from_name(t); + if (gt == GGML_TYPE_COUNT) { + invalid_param = true; + break; + } + types.push_back(gt); + } + if (invalid_param) { + break; + } + params.type_k.insert(params.type_k.end(), types.begin(), types.end()); + } else if (arg == "-ctv" || arg == "--cache-type-v") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + + std::vector types; + for (const auto & t : p) { + ggml_type gt = ggml_type_from_name(t); + if (gt == GGML_TYPE_COUNT) { + invalid_param = true; + break; + } + types.push_back(gt); + } + if (invalid_param) { + break; + } + params.type_v.insert(params.type_v.end(), types.begin(), types.end()); + } else if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_threads.insert(params.n_threads.end(), p.begin(), p.end()); + } else if (arg == "-C" || arg == "--cpu-mask") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end()); + } else if (arg == "--cpu-strict") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end()); + } else if (arg == "--poll") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.poll.insert(params.poll.end(), p.begin(), p.end()); + } else if (arg == "-ngl" || arg == "--n-gpu-layers") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); + } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rpc_servers.push_back(argv[i]); + } else if (arg == "-sm" || arg == "--split-mode") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + + std::vector modes; + for (const auto & m : p) { + llama_split_mode mode; + if (m == "none") { + mode = LLAMA_SPLIT_MODE_NONE; + } else if (m == "layer") { + mode = LLAMA_SPLIT_MODE_LAYER; + } else if (m == "row") { + mode = LLAMA_SPLIT_MODE_ROW; + } else { + invalid_param = true; + break; + } + modes.push_back(mode); + } + if (invalid_param) { + break; + } + params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end()); + } else if (arg == "-mg" || arg == "--main-gpu") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.main_gpu = parse_int_range(argv[i]); + } else if (arg == "-nkvo" || arg == "--no-kv-offload") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); + } else if (arg == "--numa") { + if (++i >= argc) { invalid_param = true; break; } - modes.push_back(mode); - } - if (invalid_param) { - break; - } - params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end()); - } else if (arg == "-mg" || arg == "--main-gpu") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.main_gpu = string_split(argv[i], split_delim); - } else if (arg == "-nkvo" || arg == "--no-kv-offload") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); - } else if (arg == "--numa") { - if (++i >= argc) { - invalid_param = true; - break; - } else { std::string value(argv[i]); - /**/ if (value == "distribute" || value == "") { + if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; @@ -569,28 +597,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - } - } else if (arg == "-fa" || arg == "--flash-attn") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end()); - } else if (arg == "-mmp" || arg == "--mmap") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end()); - } else if (arg == "-embd" || arg == "--embeddings") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.embeddings.insert(params.embeddings.end(), p.begin(), p.end()); + } else if (arg == "-fa" || arg == "--flash-attn") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end()); + } else if (arg == "-mmp" || arg == "--mmap") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end()); + } else if (arg == "-embd" || arg == "--embeddings") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.embeddings.insert(params.embeddings.end(), p.begin(), p.end()); } else if (arg == "-nopo" || arg == "--no-op-offload") { if (++i >= argc) { invalid_param = true; @@ -598,148 +625,154 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = string_split(argv[i], split_delim); params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end()); - } else if (arg == "-ts" || arg == "--tensor-split") { - if (++i >= argc) { - invalid_param = true; - break; - } - for (auto ts : string_split(argv[i], split_delim)) { - // split string by ; and / - const std::regex regex{ R"([;/]+)" }; - std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 }; - std::vector split_arg{ it, {} }; - GGML_ASSERT(split_arg.size() <= llama_max_devices()); - - std::vector tensor_split(llama_max_devices()); - for (size_t i = 0; i < llama_max_devices(); ++i) { - if (i < split_arg.size()) { - tensor_split[i] = std::stof(split_arg[i]); - } else { - tensor_split[i] = 0.0f; + } else if (arg == "-ts" || arg == "--tensor-split") { + if (++i >= argc) { + invalid_param = true; + break; + } + for (auto ts : string_split(argv[i], split_delim)) { + // split string by ; and / + const std::regex regex{ R"([;/]+)" }; + std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 }; + std::vector split_arg{ it, {} }; + GGML_ASSERT(split_arg.size() <= llama_max_devices()); + + std::vector tensor_split(llama_max_devices()); + for (size_t i = 0; i < llama_max_devices(); ++i) { + if (i < split_arg.size()) { + tensor_split[i] = std::stof(split_arg[i]); + } else { + tensor_split[i] = 0.0f; + } } + params.tensor_split.push_back(tensor_split); } - params.tensor_split.push_back(tensor_split); - } - } else if (arg == "-ot" || arg == "--override-tensor") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto value = argv[i]; - /* static */ std::map buft_list; - if (buft_list.empty()) { - // enumerate all the devices and add their buffer types to the list - for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { - auto * dev = ggml_backend_dev_get(i); - auto * buft = ggml_backend_dev_buffer_type(dev); - if (buft) { - buft_list[ggml_backend_buft_name(buft)] = buft; + } else if (arg == "-ot" || arg == "--override-tensor") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto value = argv[i]; + /* static */ std::map buft_list; + if (buft_list.empty()) { + // enumerate all the devices and add their buffer types to the list + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + auto * dev = ggml_backend_dev_get(i); + auto * buft = ggml_backend_dev_buffer_type(dev); + if (buft) { + buft_list[ggml_backend_buft_name(buft)] = buft; + } } } - } - auto override_group_span_len = std::strcspn(value, ","); - bool last_group = false; - do { - if (override_group_span_len == 0) { - // Adds an empty override-tensors for an empty span - params.tensor_buft_overrides.push_back({{}}); + auto override_group_span_len = std::strcspn(value, ","); + bool last_group = false; + do { + if (override_group_span_len == 0) { + // Adds an empty override-tensors for an empty span + params.tensor_buft_overrides.push_back({{}}); + if (value[override_group_span_len] == '\0') { + value = &value[override_group_span_len]; + last_group = true; + } else { + value = &value[override_group_span_len + 1]; + override_group_span_len = std::strcspn(value, ","); + } + continue; + } + // Stamps null terminators into the argv + // value for this option to avoid the + // memory leak present in the implementation + // over in arg.cpp. Acceptable because we + // only parse these args once in this program. + auto override_group = value; if (value[override_group_span_len] == '\0') { value = &value[override_group_span_len]; last_group = true; } else { + value[override_group_span_len] = '\0'; value = &value[override_group_span_len + 1]; - override_group_span_len = std::strcspn(value, ","); - } - continue; - } - // Stamps null terminators into the argv - // value for this option to avoid the - // memory leak present in the implementation - // over in arg.cpp. Acceptable because we - // only parse these args once in this program. - auto override_group = value; - if (value[override_group_span_len] == '\0') { - value = &value[override_group_span_len]; - last_group = true; - } else { - value[override_group_span_len] = '\0'; - value = &value[override_group_span_len + 1]; - } - std::vector group_tensor_buft_overrides{}; - auto override_span_len = std::strcspn(override_group, ";"); - while (override_span_len > 0) { - auto override = override_group; - if (override_group[override_span_len] != '\0') { - override_group[override_span_len] = '\0'; - override_group = &override_group[override_span_len + 1]; - } else { - override_group = &override_group[override_span_len]; - } - auto tensor_name_span_len = std::strcspn(override, "="); - if (tensor_name_span_len >= override_span_len) { - invalid_param = true; - break; } - override[tensor_name_span_len] = '\0'; - auto tensor_name = override; - auto buffer_type = &override[tensor_name_span_len + 1]; - if (buft_list.find(buffer_type) == buft_list.end()) { - printf("Available buffer types:\n"); - for (const auto & it : buft_list) { - printf(" %s\n", ggml_backend_buft_name(it.second)); + std::vector group_tensor_buft_overrides{}; + auto override_span_len = std::strcspn(override_group, ";"); + while (override_span_len > 0) { + auto override = override_group; + if (override_group[override_span_len] != '\0') { + override_group[override_span_len] = '\0'; + override_group = &override_group[override_span_len + 1]; + } else { + override_group = &override_group[override_span_len]; } - invalid_param = true; + auto tensor_name_span_len = std::strcspn(override, "="); + if (tensor_name_span_len >= override_span_len) { + invalid_param = true; + break; + } + override[tensor_name_span_len] = '\0'; + auto tensor_name = override; + auto buffer_type = &override[tensor_name_span_len + 1]; + if (buft_list.find(buffer_type) == buft_list.end()) { + printf("Available buffer types:\n"); + for (const auto & it : buft_list) { + printf(" %s\n", ggml_backend_buft_name(it.second)); + } + invalid_param = true; + break; + } + group_tensor_buft_overrides.push_back({tensor_name, buft_list.at(buffer_type)}); + override_span_len = std::strcspn(override_group, ";"); + } + if (invalid_param) { break; } - group_tensor_buft_overrides.push_back({tensor_name, buft_list.at(buffer_type)}); - override_span_len = std::strcspn(override_group, ";"); + group_tensor_buft_overrides.push_back({nullptr,nullptr}); + params.tensor_buft_overrides.push_back(group_tensor_buft_overrides); + override_group_span_len = std::strcspn(value, ","); + } while (!last_group); + } else if (arg == "-r" || arg == "--repetitions") { + if (++i >= argc) { + invalid_param = true; + break; } - if (invalid_param) { + params.reps = std::stoi(argv[i]); + } else if (arg == "--prio") { + if (++i >= argc) { + invalid_param = true; break; } - group_tensor_buft_overrides.push_back({nullptr,nullptr}); - params.tensor_buft_overrides.push_back(group_tensor_buft_overrides); - override_group_span_len = std::strcspn(value, ","); - } while (!last_group); - } else if (arg == "-r" || arg == "--repetitions") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.reps = std::stoi(argv[i]); - } else if (arg == "--prio") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.prio = (enum ggml_sched_priority) std::stoi(argv[i]); - } else if (arg == "--delay") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.delay = std::stoi(argv[i]); - } else if (arg == "-o" || arg == "--output") { - if (++i >= argc) { - invalid_param = true; - break; - } - invalid_param = !output_format_from_str(argv[i], params.output_format); - } else if (arg == "-oe" || arg == "--output-err") { - if (++i >= argc) { + params.prio = (enum ggml_sched_priority) std::stoi(argv[i]); + } else if (arg == "--delay") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.delay = std::stoi(argv[i]); + } else if (arg == "-o" || arg == "--output") { + if (++i >= argc) { + invalid_param = true; + break; + } + invalid_param = !output_format_from_str(argv[i], params.output_format); + } else if (arg == "-oe" || arg == "--output-err") { + if (++i >= argc) { + invalid_param = true; + break; + } + invalid_param = !output_format_from_str(argv[i], params.output_format_stderr); + } else if (arg == "-v" || arg == "--verbose") { + params.verbose = true; + } else if (arg == "--progress") { + params.progress = true; + } else { invalid_param = true; break; } - invalid_param = !output_format_from_str(argv[i], params.output_format_stderr); - } else if (arg == "-v" || arg == "--verbose") { - params.verbose = true; - } else if (arg == "--progress") { - params.progress = true; - } else { + } catch (const std::exception & e) { + fprintf(stderr, "error: %s\n", e.what()); invalid_param = true; break; } } + if (invalid_param) { fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); print_usage(argc, argv); From 9c47aafb810d12f26572637afc56911e1077bc5b Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 9 May 2025 19:58:52 +0200 Subject: [PATCH 2/5] start-end => first-last --- tools/llama-bench/llama-bench.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index ba5d9725d039b..5a301ad207198 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -350,7 +350,7 @@ static void print_usage(int /* argc */, char ** argv) { printf("\n"); printf( "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter " - "multiple times. Ranges can be specified with 'start-end' or 'start-end+step'.\n"); + "multiple times. Ranges can be specified with 'first-last' or 'first-last+step'.\n"); } static ggml_type ggml_type_from_name(const std::string & s) { From 5a7754a252b2f759b9fabcf12de50cd810934f7f Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 9 May 2025 20:35:34 +0200 Subject: [PATCH 3/5] update variable names --- tools/llama-bench/llama-bench.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index 5a301ad207198..3acab74208c2b 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -196,17 +196,17 @@ static std::string pair_str(const std::pair & p) { } static std::vector parse_int_range(const std::string & s) { - // start[-end[+step]][,start[-end[+step]]...] + // first[-last[+step]][,first[-last[+step]]...] std::regex range_regex(R"(^(\d+)(?:-(\d+)(?:\+(\d+))?)?(,|$))"); std::smatch match; std::string::const_iterator search_start(s.cbegin()); std::vector result; while (std::regex_search(search_start, s.cend(), match, range_regex)) { - int start = std::stoi(match[1]); - int end = match[2].matched ? std::stoi(match[2]) : start; + int first = std::stoi(match[1]); + int last = match[2].matched ? std::stoi(match[2]) : first; int step = match[3].matched ? std::stoi(match[3]) : 1; - for (int i = start; i <= end; i += step) { + for (int i = first; i <= last; i += step) { result.push_back(i); } search_start = match.suffix().first; From a1754641a2c3aa3eb1112e000e79fea2f066b3d3 Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 11 May 2025 20:12:05 +0200 Subject: [PATCH 4/5] add exponential steps, update documentation --- tools/llama-bench/README.md | 30 +++++++----- tools/llama-bench/llama-bench.cpp | 76 +++++++++++++++++++------------ 2 files changed, 66 insertions(+), 40 deletions(-) diff --git a/tools/llama-bench/README.md b/tools/llama-bench/README.md index d6fc77df880bd..4512a4649a9b8 100644 --- a/tools/llama-bench/README.md +++ b/tools/llama-bench/README.md @@ -20,10 +20,20 @@ Performance testing tool for llama.cpp. ## Syntax ``` -usage: ./llama-bench [options] +usage: build/bin/llama-bench [options] options: -h, --help + --numa numa mode (default: disabled) + -r, --repetitions number of times to repeat each test (default: 5) + --prio <0|1|2|3> process/thread priority (default: 0) + --delay <0...N> (seconds) delay between each test (default: 0) + -o, --output output format printed to stdout (default: md) + -oe, --output-err output format printed to stderr (default: none) + -v, --verbose verbose output + --progress print test progress indicators + +test parameters: -m, --model (default: models/7B/ggml-model-q4_0.gguf) -p, --n-prompt (default: 512) -n, --n-gen (default: 128) @@ -33,7 +43,7 @@ options: -ub, --ubatch-size (default: 512) -ctk, --cache-type-k (default: f16) -ctv, --cache-type-v (default: f16) - -t, --threads (default: 8) + -t, --threads (default: 16) -C, --cpu-mask (default: 0x0) --cpu-strict <0|1> (default: 0) --poll <0...100> (default: 50) @@ -44,17 +54,15 @@ options: -nkvo, --no-kv-offload <0|1> (default: 0) -fa, --flash-attn <0|1> (default: 0) -mmp, --mmap <0|1> (default: 1) - --numa (default: disabled) -embd, --embeddings <0|1> (default: 0) -ts, --tensor-split (default: 0) - -r, --repetitions (default: 5) - --prio <0|1|2|3> (default: 0) - --delay <0...N> (seconds) (default: 0) - -o, --output (default: md) - -oe, --output-err (default: none) - -v, --verbose (default: 0) - -Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times. + -ot --override-tensors =;... + (default: disabled) + -nopo, --no-op-offload <0|1> (default: 0) + +Multiple values can be given for each parameter by separating them with ',' +or by specifying the parameter multiple times. Ranges can be given as +'start-end' or 'start-end+step' or 'start-end*mult'. ``` llama-bench can perform three types of tests: diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index 3acab74208c2b..ca0d0aed5e9fa 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -196,18 +196,28 @@ static std::string pair_str(const std::pair & p) { } static std::vector parse_int_range(const std::string & s) { - // first[-last[+step]][,first[-last[+step]]...] - std::regex range_regex(R"(^(\d+)(?:-(\d+)(?:\+(\d+))?)?(,|$))"); + // first[-last[(+|*)step]] + std::regex range_regex(R"(^(\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))"); + std::smatch match; std::string::const_iterator search_start(s.cbegin()); std::vector result; while (std::regex_search(search_start, s.cend(), match, range_regex)) { - int first = std::stoi(match[1]); - int last = match[2].matched ? std::stoi(match[2]) : first; - int step = match[3].matched ? std::stoi(match[3]) : 1; + int first = std::stoi(match[1]); + int last = match[2].matched ? std::stoi(match[2]) : first; + char op = match[3].matched ? match[3].str()[0] : '+'; + int step = match[4].matched ? std::stoi(match[4]) : 1; - for (int i = first; i <= last; i += step) { + for (int i = first; i <= last;) { result.push_back(i); + + if (op == '+') { + i += step; + } else if (op == '*') { + i *= step; + } else { + throw std::invalid_argument("invalid range format"); + } } search_start = match.suffix().first; } @@ -275,7 +285,7 @@ static const cmd_params cmd_params_defaults = { /* no_kv_offload */ { false }, /* flash_attn */ { false }, /* tensor_split */ { std::vector(llama_max_devices(), 0.0f) }, - /* tensor_buft_overrides*/ { std::vector{{nullptr,nullptr}} }, + /* tensor_buft_overrides*/ { std::vector{ { nullptr, nullptr } } }, /* use_mmap */ { true }, /* embeddings */ { false }, /* no_op_offload */ { false }, @@ -294,13 +304,29 @@ static void print_usage(int /* argc */, char ** argv) { printf("\n"); printf("options:\n"); printf(" -h, --help\n"); + printf(" --numa numa mode (default: disabled)\n"); + printf(" -r, --repetitions number of times to repeat each test (default: %d)\n", + cmd_params_defaults.reps); + printf(" --prio <0|1|2|3> process/thread priority (default: %d)\n", + cmd_params_defaults.prio); + printf(" --delay <0...N> (seconds) delay between each test (default: %d)\n", + cmd_params_defaults.delay); + printf(" -o, --output output format printed to stdout (default: %s)\n", + output_format_str(cmd_params_defaults.output_format)); + printf(" -oe, --output-err output format printed to stderr (default: %s)\n", + output_format_str(cmd_params_defaults.output_format_stderr)); + printf(" -v, --verbose verbose output\n"); + printf(" --progress print test progress indicators\n"); + printf("\n"); + printf("test parameters:\n"); printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); printf(" -pg (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); - printf(" -d, --n-depth (default: %s)\n", join(cmd_params_defaults.n_depth, ",").c_str()); + printf(" -d, --n-depth (default: %s)\n", + join(cmd_params_defaults.n_depth, ",").c_str()); printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); printf(" -ub, --ubatch-size (default: %s)\n", @@ -332,25 +358,17 @@ static void print_usage(int /* argc */, char ** argv) { join(cmd_params_defaults.flash_attn, ",").c_str()); printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); - printf(" --numa (default: disabled)\n"); printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str()); printf(" -ts, --tensor-split (default: 0)\n"); - printf(" -ot --override-tensors =;... (default: disabled)\n"); - printf(" -nopo, --no-op-offload (default: 0)\n"); - printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); - printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio); - printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay); - printf(" -o, --output (default: %s)\n", - output_format_str(cmd_params_defaults.output_format)); - printf(" -oe, --output-err (default: %s)\n", - output_format_str(cmd_params_defaults.output_format_stderr)); - printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); - printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0"); + printf(" -ot --override-tensors =;...\n"); + printf(" (default: disabled)\n"); + printf(" -nopo, --no-op-offload <0|1> (default: 0)\n"); printf("\n"); printf( - "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter " - "multiple times. Ranges can be specified with 'first-last' or 'first-last+step'.\n"); + "Multiple values can be given for each parameter by separating them with ','\n" + "or by specifying the parameter multiple times. Ranges can be given as\n" + "'start-end' or 'start-end+step' or 'start-end*mult'.\n"); } static ggml_type ggml_type_from_name(const std::string & s) { @@ -618,13 +636,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = string_split(argv[i], split_delim); params.embeddings.insert(params.embeddings.end(), p.begin(), p.end()); - } else if (arg == "-nopo" || arg == "--no-op-offload") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end()); + } else if (arg == "-nopo" || arg == "--no-op-offload") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end()); } else if (arg == "-ts" || arg == "--tensor-split") { if (++i >= argc) { invalid_param = true; From ed2bd01045bace048a988f92e27bae7898db27c2 Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 12 May 2025 13:07:45 +0200 Subject: [PATCH 5/5] doc fix [no ci] --- tools/llama-bench/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/llama-bench/README.md b/tools/llama-bench/README.md index 4512a4649a9b8..4fb2a24e19e10 100644 --- a/tools/llama-bench/README.md +++ b/tools/llama-bench/README.md @@ -20,7 +20,7 @@ Performance testing tool for llama.cpp. ## Syntax ``` -usage: build/bin/llama-bench [options] +usage: llama-bench [options] options: -h, --help