From 750aaa8207054626c80d6a5dec27c023adec977b Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Mon, 17 Mar 2025 16:16:38 +0800 Subject: [PATCH 01/34] add log --- src/turbomind/models/llama/LlamaBatch.cc | 5 ++++- src/turbomind/models/llama/LlamaBatch.h | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 27471ba90e..5b2f9c12a8 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -242,7 +242,10 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vectorgen_cfg.max_new_tokens) std::vector seq_len_limit; std::vector sequences; From 7b4304a3fe67c916c9dc67bd3c625146badca9cc Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Mon, 24 Mar 2025 22:47:23 +0800 Subject: [PATCH 02/34] refactor tm prefix caching --- src/turbomind/models/llama/BlockTrie.cc | 82 +++++++---------- src/turbomind/models/llama/BlockTrie.h | 49 ++++++---- src/turbomind/models/llama/LlamaBatch.cc | 14 ++- src/turbomind/models/llama/SequenceManager.cc | 89 +++++++++++-------- src/turbomind/models/llama/SequenceManager.h | 18 +++- 5 files changed, 143 insertions(+), 109 deletions(-) diff --git a/src/turbomind/models/llama/BlockTrie.cc b/src/turbomind/models/llama/BlockTrie.cc index 5f87e98280..e631449726 100644 --- a/src/turbomind/models/llama/BlockTrie.cc +++ b/src/turbomind/models/llama/BlockTrie.cc @@ -14,16 +14,17 @@ size_t hash(const std::vector& vec) return seed; } -BlockTrie::BlockTrie(size_t block_seq_len, std::shared_ptr block_manager, bool enable_prefix_caching): - block_seq_len_(block_seq_len), block_manager_(block_manager), enable_prefix_caching_(enable_prefix_caching) +BlockTrie::BlockTrie(size_t block_seq_len, std::shared_ptr block_manager): + block_seq_len_(block_seq_len), block_manager_(block_manager) { root_ = std::make_shared(); } -void BlockTrie::match(Sequence& seq) +std::tuple>> BlockTrie::match(const Sequence& seq) { BlockIds matched_blocks; UniqueIds matched_unique_ids; + std::vector> matched_nodes; std::shared_ptr curr_node = root_; int num_matched = 0; @@ -34,50 +35,53 @@ void BlockTrie::match(Sequence& seq) size_t hash_key = hash(curr_tokens); auto it = curr_node->children.find(hash_key); - if (it == curr_node->children.end()) { break; } - if (curr_tokens != it->second->tokens) { + TM_LOG_WARNING("hash key cache hit, but tokens are not the same"); break; } - matched_blocks.push_back(it->second->block_id); - matched_unique_ids.push_back(it->second->block_unique_id); + matched_blocks.emplace_back(it->second->block_id); + matched_unique_ids.emplace_back(it->second->block_unique_id); + matched_nodes.emplace_back(it->second); curr_node = it->second; num_matched += block_seq_len_; } - - if (matched_blocks.size() > 0) { - // add use count - block_manager_->Lock(matched_blocks); - block_manager_->Touch(matched_blocks); - // only consider no history blocks - seq.blocks.insert(seq.blocks.end(), matched_blocks.begin(), matched_blocks.end()); - seq.block_unique_ids.insert(seq.block_unique_ids.end(), matched_unique_ids.begin(), matched_unique_ids.end()); - } + return std::tuple(matched_blocks, matched_unique_ids, matched_nodes); } -void BlockTrie::cache(const Sequence& seq) +std::pair BlockTrie::cache(const Sequence& seq, const std::vector& tokens) { + FT_CHECK(tokens.size() >= seq.blocks.size() * block_seq_len_); + std::shared_ptr curr_node = root_; - int num_matched = 0; int idx = 0; - BlockIds cached_blocks; - while (num_matched + block_seq_len_ <= 
seq.prompt.size()) { - std::vector curr_tokens(seq.prompt.begin() + num_matched, - seq.prompt.begin() + num_matched + block_seq_len_); - size_t hash_key = hash(curr_tokens); + BlockIds cache_block_ids; + UniqueIds cache_block_unique_ids; - auto it = curr_node->children.find(hash_key); + // Only cache valid blocks + int valid_blocks = block_manager_->Verify(seq.blocks, seq.block_unique_ids); + + // We don't cache the last block of the sequence, since it might not be full + // TODO(lvhan): determine wether the last block is full or not. It is not trivial + // considering chunk prefill + for (int idx = 0; idx < valid_blocks - 1; ++idx) { + auto start = tokens.begin() + idx * block_seq_len_; + auto end = start + block_seq_len_; + std::vector curr_tokens(start, end); + // TODO(lvhan): add salt to ensure the hash security + size_t hash_key = hash(curr_tokens); int block_id = seq.blocks[idx]; uint64_t block_unique_id = seq.block_unique_ids[idx]; + auto it = curr_node->children.find(hash_key); if (it != curr_node->children.end()) { if (curr_tokens != it->second->tokens) { + TM_LOG_WARNING("hash key cache hit, but tokens are not the same"); break; } curr_node = it->second; @@ -91,38 +95,14 @@ void BlockTrie::cache(const Sequence& seq) node->tokens = curr_tokens; node->block_id = block_id; node->block_unique_id = block_unique_id; - node->num_matched = num_matched + block_seq_len_; curr_node->children[hash_key] = node; curr_node = node; } - - cached_blocks.push_back(curr_node->block_id); - num_matched += block_seq_len_; - idx++; + cache_block_ids.emplace_back(block_id); + cache_block_unique_ids.emplace_back(block_unique_id); } - block_manager_->Touch(cached_blocks); -} - -int BlockTrie::verify() -{ - return verify_traverse(root_); -} - -int BlockTrie::verify_traverse(std::shared_ptr& node) -{ - int valid_count = 1; - for (auto it = node->children.begin(); it != node->children.end();) { - if (block_manager_->unique_id(it->second->block_id) != it->second->block_unique_id) { - // child invalid - it = node->children.erase(it); - } - else { - valid_count += verify_traverse(it->second); - it++; - } - } - return valid_count; + return std::pair(cache_block_ids, cache_block_unique_ids); } } // namespace turbomind diff --git a/src/turbomind/models/llama/BlockTrie.h b/src/turbomind/models/llama/BlockTrie.h index b48c00061c..eb06a5d95d 100644 --- a/src/turbomind/models/llama/BlockTrie.h +++ b/src/turbomind/models/llama/BlockTrie.h @@ -17,32 +17,45 @@ struct TrieNode { std::vector tokens; int block_id; uint64_t block_unique_id; - int num_matched; }; class BlockTrie { public: - explicit BlockTrie(size_t block_len_, std::shared_ptr block_manager, bool enable_prefix_caching); + explicit BlockTrie(size_t block_len, std::shared_ptr block_manager); - bool enabled() - { - return enable_prefix_caching_; - } + /** @brief Attempt to match cached key-value (KV) blocks for a given sequence. + * + * This function iterates the tokens of the sequence and attempts + * to match them with the cached KV blocks. If the max prefix match is found, + * it returns the IDs, unique IDs, and hash keys of the matched blocks. + * + * @param seq The sequence whose tokens are to be matched against the cached KV blocks. + * @return A tuple containing the following: + * - BlockIds: A list of IDs of the matched blocks. + * - UniqueIds: A list of unique IDs of the matched blocks. + * - std::vector>: A list of matched node + * + * @note If no blocks are matched, all containers in the returned tuple will be empty. 
+ */ + std::tuple>> match(const Sequence& seq); - // get cached blocks for sequence - void match(Sequence& seq); - - // cache computed blocks for sequence - void cache(const Sequence& seq); - - // remove invalid nodes, return valid count - int verify(); - -private: - int verify_traverse(std::shared_ptr& node); + /** + * @brief Cache the key-value (KV) blocks of a given sequence. + * + * This function caches the KV blocks of the specified sequence. Only valid blocks + * of a sequence whose status is NOT `Sequence::kCached` are considered + * for caching. + * + * @param seq The sequence whose KV blocks are to be cached. + * @param tokens The token list that the quence's KV blocks map + * @return A pair of vectors containing the IDs and unique IDs of the successfully + cached blocks. If no blocks are cached, a pair of empty vectors is returned. + * + * @note Only valid blocks of a non-kCached sequence are processed for caching. + */ + std::pair cache(const Sequence& seq, const std::vector& tokens); private: - bool enable_prefix_caching_; size_t block_seq_len_; std::shared_ptr block_manager_; diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 5b2f9c12a8..f4dc84fe9c 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -224,6 +224,16 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vectortokens.size(); } + else if (s == ptr->tokens.size()) { + if (rank_ == 0) { + TM_LOG_INFO("[ProcessInferRequests] ID %lu, step(%d) == tokens(%d)", ptr->id, s, ptr->tokens.size()); + } + } + else if (s != 0) { + if (rank_ == 0) { + TM_LOG_WARNING("[ProcessInferRequests] ID %lu, step(%d) < tokens(%d)", ptr->id, s, ptr->tokens.size()); + } + } return s; }(); @@ -1486,7 +1496,7 @@ void LlamaBatch::Finish(GenerationState& g, std::vector& signals) } // Cache computed blocks to block trie - sequence_manager_->CacheIfEnabled(state_->sequences, batch_size); + sequence_manager_->CachePrompt(state_->sequences, batch_size); if (debug_ && rank_ == 0) { for (int i = 0; i < batch_size; ++i) { @@ -1587,6 +1597,8 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig // output_ids is updated & synced in `Finish` const auto output_ids = state_->requests[index]->output_ids.getPtr(); std::copy_n(output_ids, output_len, seq.tokens.data()); + // Cache the generated tokens of the sequence + sequence_manager_->CacheGeneration(seq); // Save random state in host memory seq.random_state.resize(sizeof(curandState_t)); diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc index 4851a0fc44..4d1b5d6334 100644 --- a/src/turbomind/models/llama/SequenceManager.cc +++ b/src/turbomind/models/llama/SequenceManager.cc @@ -11,9 +11,27 @@ #include #include #include +#include namespace turbomind { +template +std::string serialize_vector(const std::vector& data) +{ + if (data.empty()) { + return ""; + } + + std::stringstream ss; + auto it = data.begin(); + ss << *it; + + for (++it; it < data.end(); ++it) { + ss << ", " << *it; + } + return ss.str(); +} + SequenceManager::SequenceManager(size_t layer_num, const BlockConfig& block_config, double block_count, @@ -30,7 +48,9 @@ SequenceManager::SequenceManager(size_t layer_num, size_t block_size = layout.block_size(layer_num); block_manager_ = std::make_shared(block_size, block_count, chunk_size, allocator, get_free_size); - block_trie_ = std::make_shared(block_config.block_len_, block_manager_, 
enable_prefix_caching); + if (enable_prefix_caching) { + block_trie_ = std::make_shared(block_config.block_len_, block_manager_); + } } const Sequence* SequenceManager::Create(uint64_t id) @@ -71,7 +91,7 @@ void SequenceManager::Erase(std::map::iterator& it) UpdateAndSetUnlock(seq); } // if prefix cache enabled, blocks will be shared by sequences, cannot be freed immediately - if (!block_trie_->enabled()) { + if (block_trie_) { freed_.insert(freed_.end(), seq.blocks.begin(), seq.blocks.end()); } it = sequences_.erase(it); @@ -86,21 +106,29 @@ bool SequenceManager::Erase(uint64_t id) return false; } -void SequenceManager::CacheIfEnabled(const Sequences& sequences, int active_size) +void SequenceManager::CachePrompt(const Sequences& sequences, int active_size) { - if (block_trie_->enabled()) { - block_trie_->verify(); + if (block_trie_) { for (int i = 0; i < active_size; ++i) { auto& seq = *sequences[i]; - // only cache prompt blocks - if (!seq.prompt.empty()) { - block_trie_->cache(seq); - seq.prompt.clear(); - } + BlockIds block_ids; + UniqueIds block_unique_ids; + std::tie(block_ids, block_unique_ids) = block_trie_->cache(seq, seq.prompt); + // TODO: } } } +void SequenceManager::CacheGeneration(const Sequence& seq) +{ + if (block_trie_) { + BlockIds block_ids; + UniqueIds block_unique_ids; + std::tie(block_ids, block_unique_ids) = block_trie_->cache(seq, seq.tokens); + // TODO: + } +} + void SequenceManager::VerifyAndLockCached(const Sequences& sequences) { BlockIds blocks; @@ -323,25 +351,6 @@ void SequenceManager::SortByPriority(Sequences& sequences, context_lengths.swap(tmp_lengths); } -// template -// void SortByPriority(const std::vector
& priorities, Ts&... ranges) -// { -// // sort according to priority -// std::vector idxs(priorities.size()); -// std::iota(idxs.begin(), idxs.end(), 0); -// std::sort(idxs.begin(), idxs.end(), [&](int i, int j) { -// return priorities[i] < priorities[j]; // -// }); -// auto reorder = [&](auto& src) { -// auto dst = src; -// for (size_t i = 0; i < idxs.size(); ++i) { -// dst[i] = src[idxs[i]]; -// } -// src.swap(dst); -// }; -// (reorder(ranges), ...); -// } - std::vector SequenceManager::CountRequiredBlocks(const Sequences& sequences, const std::vector& context_lengths, int step_length) @@ -394,17 +403,23 @@ auto SequenceManager::Materialize(Sequences sequences, // the blocks can still be preempted later VerifyAndLockCached(sequences); - if (block_trie_->enabled()) { - // verify blocks in trie cache - block_trie_->verify(); - + if (block_trie_) { // match prefix cache for (int i = 0; i < sequences.size(); i++) { - if (!sequences[i]->prompt.empty() && sequences[i]->blocks.empty()) { - auto& seq = const_cast(*sequences[i]); - block_trie_->match(seq); - seq.cache_len = seq.blocks.size() * block_seq_len_; + BlockIds block_ids; + UniqueIds unique_ids; + std::vector> matched_nodes; + auto& seq = *sequences[i]; + + std::tie(block_ids, unique_ids, matched_nodes) = block_trie_->match(seq); + const int count = block_manager_->Verify(block_ids, unique_ids); + seq.cache_len = count * block_seq_len_; + if (rank_ == 0) { + TM_LOG_DEBUG("matched block_ids %s, unique_ids %s", serialize_vector(block_ids), serialize_vector(unique_ids), count); + TM_LOG_DEBUG("valid block count %d, cache_len %d", count, seq.cache_len); } + // TODO: remove invalid node in `block_trie_` + // How to retrieve the invalid node root in O(1), matched_values -> matched_nodes (the match path) } } diff --git a/src/turbomind/models/llama/SequenceManager.h b/src/turbomind/models/llama/SequenceManager.h index a71a556aaa..b33775bf8d 100644 --- a/src/turbomind/models/llama/SequenceManager.h +++ b/src/turbomind/models/llama/SequenceManager.h @@ -107,8 +107,22 @@ class SequenceManager { const std::vector& priorities, int step_length, AdjustInputCount adjust); - - void CacheIfEnabled(const Sequences& sequences, int active_size); + /** @brief cache the input prompt tokens of each seq in sequences[0:active_size-1] + * + * @param sequences The sequence list + * @param active_size the number of active sequences in the list + */ + void CachePrompt(const Sequences& sequences, int active_size); + + /** @brief cache the generated tokens of a given sequence + * + * @param sequence the given sequence + * + * @note This function can only be called after the sequence finish generation + * and all tokens including the prompt tokens and generated tokens have been put to + * `seq.tokens` + */ + void CacheGeneration(const Sequence& sequence); [[nodiscard]] void* GetBlockPtr(int block_id) { From 8be44f83d2a8d3f04fe27ce66e148bac25a8aae9 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 25 Mar 2025 17:31:36 +0800 Subject: [PATCH 03/34] refactor tm prefix cache --- src/turbomind/models/llama/BlockTrie.cc | 34 ++++-- src/turbomind/models/llama/BlockTrie.h | 19 ++- src/turbomind/models/llama/LlamaBatch.cc | 2 +- src/turbomind/models/llama/SequenceManager.cc | 112 ++++++++++++------ src/turbomind/models/llama/SequenceManager.h | 2 + 5 files changed, 122 insertions(+), 47 deletions(-) diff --git a/src/turbomind/models/llama/BlockTrie.cc b/src/turbomind/models/llama/BlockTrie.cc index e631449726..6bdf9de1a0 100644 --- a/src/turbomind/models/llama/BlockTrie.cc +++ 
b/src/turbomind/models/llama/BlockTrie.cc @@ -20,7 +20,7 @@ BlockTrie::BlockTrie(size_t block_seq_len, std::shared_ptr block_m root_ = std::make_shared(); } -std::tuple>> BlockTrie::match(const Sequence& seq) +std::tuple>> BlockTrie::match(const Sequence& seq) const { BlockIds matched_blocks; UniqueIds matched_unique_ids; @@ -52,23 +52,26 @@ std::tuple>> BlockTri return std::tuple(matched_blocks, matched_unique_ids, matched_nodes); } -std::pair BlockTrie::cache(const Sequence& seq, const std::vector& tokens) +std::tuple>> BlockTrie::cache(const Sequence& seq, const std::vector& tokens) { - FT_CHECK(tokens.size() >= seq.blocks.size() * block_seq_len_); + TM_LOG_INFO("[cache] session %llu, seq.blocks %d, tokens %d", seq.id, seq.blocks.size(), tokens.size()); + FT_CHECK(seq.status != Sequence::kCached); + FT_CHECK(tokens.size() <= seq.blocks.size() * block_seq_len_); std::shared_ptr curr_node = root_; int idx = 0; BlockIds cache_block_ids; UniqueIds cache_block_unique_ids; + std::vector> cache_nodes; - // Only cache valid blocks - int valid_blocks = block_manager_->Verify(seq.blocks, seq.block_unique_ids); + // // Only cache valid blocks + // int valid_blocks = block_manager_->Verify(seq.blocks, seq.block_unique_ids); // We don't cache the last block of the sequence, since it might not be full // TODO(lvhan): determine wether the last block is full or not. It is not trivial // considering chunk prefill - for (int idx = 0; idx < valid_blocks - 1; ++idx) { + for (int idx = 0; idx < seq.blocks.size() - 1; ++idx) { auto start = tokens.begin() + idx * block_seq_len_; auto end = start + block_seq_len_; std::vector curr_tokens(start, end); @@ -100,9 +103,26 @@ std::pair BlockTrie::cache(const Sequence& seq, const std:: } cache_block_ids.emplace_back(block_id); cache_block_unique_ids.emplace_back(block_unique_id); + cache_nodes.emplace_back(curr_node); } - return std::pair(cache_block_ids, cache_block_unique_ids); + return std::make_tuple(cache_block_ids, cache_block_unique_ids, cache_nodes); +} + + +void BlockTrie::Remove(const std::vector>& nodes, int valid_size) { + if (nodes.empty() || valid_size < 1 ) { + return; + } + // visit nodes in reverse order + for (int idx = nodes.size() - 1; idx >= valid_size; --idx) { + auto child = nodes[idx]; + auto parent = nodes[idx - 1]; + auto it = parent->children.find(child->hash_key); + FT_CHECK(it != parent->children.end()); + FT_CHECK(it->second->tokens == child->tokens); + parent->children.erase(it); + } } } // namespace turbomind diff --git a/src/turbomind/models/llama/BlockTrie.h b/src/turbomind/models/llama/BlockTrie.h index eb06a5d95d..f6fd71fc37 100644 --- a/src/turbomind/models/llama/BlockTrie.h +++ b/src/turbomind/models/llama/BlockTrie.h @@ -37,7 +37,7 @@ class BlockTrie { * * @note If no blocks are matched, all containers in the returned tuple will be empty. */ - std::tuple>> match(const Sequence& seq); + std::tuple>> match(const Sequence& seq) const; /** * @brief Cache the key-value (KV) blocks of a given sequence. @@ -48,13 +48,20 @@ class BlockTrie { * * @param seq The sequence whose KV blocks are to be cached. * @param tokens The token list that the quence's KV blocks map - * @return A pair of vectors containing the IDs and unique IDs of the successfully - cached blocks. If no blocks are cached, a pair of empty vectors is returned. - * - * @note Only valid blocks of a non-kCached sequence are processed for caching. + * @return A tuple containing the following: + * - BlockIds: A list of IDs of the cached blocks. 
+ * - UniqueIds: A list of unique IDs of the cached blocks. + * - std::vector>: A list of cached node */ - std::pair cache(const Sequence& seq, const std::vector& tokens); + std::tuple>> cache(const Sequence& seq, const std::vector& tokens); + + /** @brief remove nodes[valid_size:] in a visited path from the trie tree + * @param nodes a visited path returned by `match` or `cache` + * @param valid_size the valid number of cached blocks from the beginning of the path + * @note the visited path must be the returned value from `match` or `cache` + */ + void Remove(const std::vector>& nodes, int valid_size); private: size_t block_seq_len_; diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index f4dc84fe9c..85b9ff121b 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -1583,7 +1583,7 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig TM_LOG_INFO("[Interrupt] slot %d, tokens [%s]", index, ss.str().c_str()); } - if (state_->requests[index]->session.end_flag || force_end) { + if (/*state_->requests[index]->session.end_flag ||*/ force_end) { // Sequence is ending this round or a stop request is issued to end it FT_CHECK(sequence_manager_->Erase(state_->requests[index]->id)); } diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc index 4d1b5d6334..1c8c122f7c 100644 --- a/src/turbomind/models/llama/SequenceManager.cc +++ b/src/turbomind/models/llama/SequenceManager.cc @@ -19,14 +19,14 @@ template std::string serialize_vector(const std::vector& data) { if (data.empty()) { - return ""; + return "nil"; } std::stringstream ss; auto it = data.begin(); ss << *it; - for (++it; it < data.end(); ++it) { + for (++it; it != data.end(); ++it) { ss << ", " << *it; } return ss.str(); @@ -108,24 +108,48 @@ bool SequenceManager::Erase(uint64_t id) void SequenceManager::CachePrompt(const Sequences& sequences, int active_size) { - if (block_trie_) { - for (int i = 0; i < active_size; ++i) { - auto& seq = *sequences[i]; - BlockIds block_ids; - UniqueIds block_unique_ids; - std::tie(block_ids, block_unique_ids) = block_trie_->cache(seq, seq.prompt); - // TODO: + if (!block_trie_) { + return; + } + for (int i = 0; i < active_size; ++i) { + auto& seq = *sequences[i]; + if (seq.cache_len > seq.prompt.size()) { + // seq prefill finished. 
We don't cache the prompt any longer + continue; + } + BlockIds block_ids; + UniqueIds block_unique_ids; + std::vector> nodes; + std::tie(block_ids, block_unique_ids, nodes) = block_trie_->cache(seq, seq.prompt); + int valid = block_manager_->Verify(block_ids, block_unique_ids); + if (rank_ == 0) { + TM_LOG_INFO("[CachePrompt] session %llu, cached block_ids %s, cached block_unique_ids %s, valid %d", + seq.id, serialize_vector(block_ids).c_str(), serialize_vector(block_unique_ids).c_str(), valid); + } + // remove invalid nodes from trie tree if there is any + if (valid < block_ids.size()) { + block_trie_->Remove(nodes, valid); } } } void SequenceManager::CacheGeneration(const Sequence& seq) { - if (block_trie_) { - BlockIds block_ids; - UniqueIds block_unique_ids; - std::tie(block_ids, block_unique_ids) = block_trie_->cache(seq, seq.tokens); - // TODO: + if (!block_trie_) { + return; + } + BlockIds block_ids; + UniqueIds block_unique_ids; + std::vector> nodes; + std::tie(block_ids, block_unique_ids, nodes) = block_trie_->cache(seq, seq.tokens); + int valid = block_manager_->Verify(block_ids, block_unique_ids); + if (rank_ == 0) { + TM_LOG_INFO("[CacheGeneration] session %llu, cached block_ids %s, cached block_unique_ids %s, valid %d", + seq.id, serialize_vector(block_ids).c_str(), serialize_vector(block_unique_ids).c_str(), valid); + } + // remove invalid nodes from trie tree if there is any + if (valid < block_ids.size()) { + block_trie_->Remove(nodes, valid); } } @@ -383,6 +407,45 @@ void SequenceManager::AssignAndActivate(const Sequences& sequences, // } } +void SequenceManager::PrefixMatch(Sequences& sequences) { + if (!block_trie_) { + return; + } + + for (int i = 0; i < sequences.size(); i++) { + BlockIds block_ids; + UniqueIds unique_ids; + std::vector> matched_nodes; + auto& seq = const_cast(*sequences[i]); + + if (seq.cache_len != 0) { + // We only apply prefix-cache matching when seq.cache_len is 0, + // which means this seq is a brand-new sequence. + // seq.cache_len is updated after every forward iter. 
Refer to `LlamaBatch::Forward` + continue; + } + std::tie(block_ids, unique_ids, matched_nodes) = block_trie_->match(seq); + const int valid = block_manager_->Verify(block_ids, unique_ids); + if (rank_ == 0) { + TM_LOG_INFO("[match] session %llu, matched block_ids %s, unique_ids %s", + seq.id, serialize_vector(block_ids).c_str(), serialize_vector(unique_ids).c_str()); + TM_LOG_INFO("[match] valid blocks %d, cache_len %d", valid, seq.cache_len); + } + // remove invalid nodes from trie tree if there is any + if (valid < block_ids.size()) { + block_trie_->Remove(matched_nodes, valid); + } + BlockIds matched_blocks(block_ids.begin(), block_ids.begin() + valid); + block_manager_->Lock(matched_blocks); + // block_manager_->Touch(matched_blocks); + + seq.blocks.insert(seq.blocks.end(), block_ids.begin(), block_ids.begin() + valid); + seq.block_unique_ids.insert(seq.block_unique_ids.end(), unique_ids.begin(), unique_ids.begin() + valid); + seq.cache_len = valid * block_seq_len_; + } + +} + auto SequenceManager::Materialize(Sequences sequences, std::vector context_lengths, const std::vector& priorities, @@ -403,25 +466,8 @@ auto SequenceManager::Materialize(Sequences sequences, // the blocks can still be preempted later VerifyAndLockCached(sequences); - if (block_trie_) { - // match prefix cache - for (int i = 0; i < sequences.size(); i++) { - BlockIds block_ids; - UniqueIds unique_ids; - std::vector> matched_nodes; - auto& seq = *sequences[i]; - - std::tie(block_ids, unique_ids, matched_nodes) = block_trie_->match(seq); - const int count = block_manager_->Verify(block_ids, unique_ids); - seq.cache_len = count * block_seq_len_; - if (rank_ == 0) { - TM_LOG_DEBUG("matched block_ids %s, unique_ids %s", serialize_vector(block_ids), serialize_vector(unique_ids), count); - TM_LOG_DEBUG("valid block count %d, cache_len %d", count, seq.cache_len); - } - // TODO: remove invalid node in `block_trie_` - // How to retrieve the invalid node root in O(1), matched_values -> matched_nodes (the match path) - } - } + PrefixMatch(sequences); + const int max_input_count = adjust(sequences, context_lengths); diff --git a/src/turbomind/models/llama/SequenceManager.h b/src/turbomind/models/llama/SequenceManager.h index b33775bf8d..0ab3326138 100644 --- a/src/turbomind/models/llama/SequenceManager.h +++ b/src/turbomind/models/llama/SequenceManager.h @@ -154,6 +154,8 @@ class SequenceManager { const BlockIds& blocks, const UniqueIds& unique_ids); + void PrefixMatch(Sequences& sequences); + private: int block_seq_len_; int rank_; From fda1e25da3504abc7b9fd72e476d172aeaab2223 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 25 Mar 2025 18:10:16 +0800 Subject: [PATCH 04/34] fix linting --- src/turbomind/models/llama/BlockTrie.cc | 32 ++++++++--------- src/turbomind/models/llama/BlockTrie.h | 34 +++++++++---------- src/turbomind/models/llama/LlamaBatch.cc | 15 +------- src/turbomind/models/llama/LlamaBatch.h | 5 ++- src/turbomind/models/llama/SequenceManager.cc | 17 +++++----- 5 files changed, 45 insertions(+), 58 deletions(-) diff --git a/src/turbomind/models/llama/BlockTrie.cc b/src/turbomind/models/llama/BlockTrie.cc index 6bdf9de1a0..c049c5c11a 100644 --- a/src/turbomind/models/llama/BlockTrie.cc +++ b/src/turbomind/models/llama/BlockTrie.cc @@ -14,16 +14,16 @@ size_t hash(const std::vector& vec) return seed; } -BlockTrie::BlockTrie(size_t block_seq_len, std::shared_ptr block_manager): - block_seq_len_(block_seq_len), block_manager_(block_manager) +BlockTrie::BlockTrie(size_t block_len): + block_seq_len_(block_len) { 
root_ = std::make_shared(); } std::tuple>> BlockTrie::match(const Sequence& seq) const { - BlockIds matched_blocks; - UniqueIds matched_unique_ids; + BlockIds matched_blocks; + UniqueIds matched_unique_ids; std::vector> matched_nodes; std::shared_ptr curr_node = root_; @@ -52,28 +52,27 @@ std::tuple>> BlockTri return std::tuple(matched_blocks, matched_unique_ids, matched_nodes); } -std::tuple>> BlockTrie::cache(const Sequence& seq, const std::vector& tokens) +std::tuple>> BlockTrie::cache(const Sequence& seq, + const std::vector& tokens) { TM_LOG_INFO("[cache] session %llu, seq.blocks %d, tokens %d", seq.id, seq.blocks.size(), tokens.size()); FT_CHECK(seq.status != Sequence::kCached); FT_CHECK(tokens.size() <= seq.blocks.size() * block_seq_len_); - std::shared_ptr curr_node = root_; - int idx = 0; + std::shared_ptr curr_node = root_; + int idx = 0; - BlockIds cache_block_ids; - UniqueIds cache_block_unique_ids; + BlockIds cache_block_ids; + UniqueIds cache_block_unique_ids; std::vector> cache_nodes; - // // Only cache valid blocks - // int valid_blocks = block_manager_->Verify(seq.blocks, seq.block_unique_ids); - // We don't cache the last block of the sequence, since it might not be full // TODO(lvhan): determine wether the last block is full or not. It is not trivial // considering chunk prefill for (int idx = 0; idx < seq.blocks.size() - 1; ++idx) { auto start = tokens.begin() + idx * block_seq_len_; - auto end = start + block_seq_len_; + auto end = start + block_seq_len_; + std::vector curr_tokens(start, end); // TODO(lvhan): add salt to ensure the hash security size_t hash_key = hash(curr_tokens); @@ -110,15 +109,16 @@ std::tuple>> BlockTri } -void BlockTrie::Remove(const std::vector>& nodes, int valid_size) { +void BlockTrie::Remove(const std::vector>& nodes, int valid_size) +{ if (nodes.empty() || valid_size < 1 ) { return; } // visit nodes in reverse order for (int idx = nodes.size() - 1; idx >= valid_size; --idx) { - auto child = nodes[idx]; + auto child = nodes[idx]; auto parent = nodes[idx - 1]; - auto it = parent->children.find(child->hash_key); + auto it = parent->children.find(child->hash_key); FT_CHECK(it != parent->children.end()); FT_CHECK(it->second->tokens == child->tokens); parent->children.erase(it); diff --git a/src/turbomind/models/llama/BlockTrie.h b/src/turbomind/models/llama/BlockTrie.h index f6fd71fc37..953735241e 100644 --- a/src/turbomind/models/llama/BlockTrie.h +++ b/src/turbomind/models/llama/BlockTrie.h @@ -21,22 +21,22 @@ struct TrieNode { class BlockTrie { public: - explicit BlockTrie(size_t block_len, std::shared_ptr block_manager); + explicit BlockTrie(size_t block_len); /** @brief Attempt to match cached key-value (KV) blocks for a given sequence. - * - * This function iterates the tokens of the sequence and attempts - * to match them with the cached KV blocks. If the max prefix match is found, - * it returns the IDs, unique IDs, and hash keys of the matched blocks. - * - * @param seq The sequence whose tokens are to be matched against the cached KV blocks. - * @return A tuple containing the following: - * - BlockIds: A list of IDs of the matched blocks. - * - UniqueIds: A list of unique IDs of the matched blocks. - * - std::vector>: A list of matched node - * - * @note If no blocks are matched, all containers in the returned tuple will be empty. - */ + * + * This function iterates the tokens of the sequence and attempts + * to match them with the cached KV blocks. 
If the max prefix match is found, + * it returns the IDs, unique IDs, and hash keys of the matched blocks. + * + * @param seq The sequence whose tokens are to be matched against the cached KV blocks. + * @return A tuple containing the following: + * - BlockIds: A list of IDs of the matched blocks. + * - UniqueIds: A list of unique IDs of the matched blocks. + * - std::vector>: A list of matched node + * + * @note If no blocks are matched, all containers in the returned tuple will be empty. + */ std::tuple>> match(const Sequence& seq) const; /** @@ -53,7 +53,8 @@ class BlockTrie { * - UniqueIds: A list of unique IDs of the cached blocks. * - std::vector>: A list of cached node */ - std::tuple>> cache(const Sequence& seq, const std::vector& tokens); + std::tuple>> cache(const Sequence& seq, + const std::vector& tokens); /** @brief remove nodes[valid_size:] in a visited path from the trie tree @@ -62,11 +63,10 @@ class BlockTrie { * @note the visited path must be the returned value from `match` or `cache` */ void Remove(const std::vector>& nodes, int valid_size); + private: size_t block_seq_len_; - std::shared_ptr block_manager_; - std::shared_ptr root_; }; diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 85b9ff121b..a75605ce10 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -224,16 +224,6 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vectortokens.size(); } - else if (s == ptr->tokens.size()) { - if (rank_ == 0) { - TM_LOG_INFO("[ProcessInferRequests] ID %lu, step(%d) == tokens(%d)", ptr->id, s, ptr->tokens.size()); - } - } - else if (s != 0) { - if (rank_ == 0) { - TM_LOG_WARNING("[ProcessInferRequests] ID %lu, step(%d) < tokens(%d)", ptr->id, s, ptr->tokens.size()); - } - } return s; }(); @@ -252,10 +242,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vectorgen_cfg.max_new_tokens) + std::vector seq_len_limit; std::vector sequences; diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc index 1c8c122f7c..4feef6bcd6 100644 --- a/src/turbomind/models/llama/SequenceManager.cc +++ b/src/turbomind/models/llama/SequenceManager.cc @@ -21,8 +21,8 @@ std::string serialize_vector(const std::vector& data) if (data.empty()) { return "nil"; } - std::stringstream ss; + auto it = data.begin(); ss << *it; @@ -49,7 +49,7 @@ SequenceManager::SequenceManager(size_t layer_num, block_manager_ = std::make_shared(block_size, block_count, chunk_size, allocator, get_free_size); if (enable_prefix_caching) { - block_trie_ = std::make_shared(block_config.block_len_, block_manager_); + block_trie_ = std::make_shared(block_config.block_len_); } } @@ -105,7 +105,7 @@ bool SequenceManager::Erase(uint64_t id) } return false; } - +//clang-format off void SequenceManager::CachePrompt(const Sequences& sequences, int active_size) { if (!block_trie_) { @@ -138,7 +138,7 @@ void SequenceManager::CacheGeneration(const Sequence& seq) if (!block_trie_) { return; } - BlockIds block_ids; + BlockIds block_ids; UniqueIds block_unique_ids; std::vector> nodes; std::tie(block_ids, block_unique_ids, nodes) = block_trie_->cache(seq, seq.tokens); @@ -152,7 +152,7 @@ void SequenceManager::CacheGeneration(const Sequence& seq) block_trie_->Remove(nodes, valid); } } - +// clang-format on void SequenceManager::VerifyAndLockCached(const Sequences& sequences) { BlockIds blocks; @@ -407,7 +407,9 @@ void SequenceManager::AssignAndActivate(const 
Sequences& sequences, // } } -void SequenceManager::PrefixMatch(Sequences& sequences) { +//clang-format off +void SequenceManager::PrefixMatch(Sequences& sequences) +{ if (!block_trie_) { return; } @@ -443,8 +445,8 @@ void SequenceManager::PrefixMatch(Sequences& sequences) { seq.block_unique_ids.insert(seq.block_unique_ids.end(), unique_ids.begin(), unique_ids.begin() + valid); seq.cache_len = valid * block_seq_len_; } - } +//clang-format on auto SequenceManager::Materialize(Sequences sequences, std::vector context_lengths, @@ -468,7 +470,6 @@ auto SequenceManager::Materialize(Sequences sequences, PrefixMatch(sequences); - const int max_input_count = adjust(sequences, context_lengths); std::vector required = CountRequiredBlocks(sequences, context_lengths, step_length); From a4ffe41db1e36c96971dcbfcb10e2c872ded7da5 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 25 Mar 2025 19:12:15 +0800 Subject: [PATCH 05/34] fix linting --- src/turbomind/models/llama/BlockTrie.cc | 8 ++-- src/turbomind/models/llama/BlockTrie.h | 4 +- src/turbomind/models/llama/SequenceManager.cc | 42 +++++++++++-------- 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/src/turbomind/models/llama/BlockTrie.cc b/src/turbomind/models/llama/BlockTrie.cc index c049c5c11a..9708d0bcea 100644 --- a/src/turbomind/models/llama/BlockTrie.cc +++ b/src/turbomind/models/llama/BlockTrie.cc @@ -14,8 +14,7 @@ size_t hash(const std::vector& vec) return seed; } -BlockTrie::BlockTrie(size_t block_len): - block_seq_len_(block_len) +BlockTrie::BlockTrie(size_t block_len): block_seq_len_(block_len) { root_ = std::make_shared(); } @@ -52,7 +51,7 @@ std::tuple>> BlockTri return std::tuple(matched_blocks, matched_unique_ids, matched_nodes); } -std::tuple>> BlockTrie::cache(const Sequence& seq, +std::tuple>> BlockTrie::cache(const Sequence& seq, const std::vector& tokens) { TM_LOG_INFO("[cache] session %llu, seq.blocks %d, tokens %d", seq.id, seq.blocks.size(), tokens.size()); @@ -108,10 +107,9 @@ std::tuple>> BlockTri return std::make_tuple(cache_block_ids, cache_block_unique_ids, cache_nodes); } - void BlockTrie::Remove(const std::vector>& nodes, int valid_size) { - if (nodes.empty() || valid_size < 1 ) { + if (nodes.empty() || valid_size < 1) { return; } // visit nodes in reverse order diff --git a/src/turbomind/models/llama/BlockTrie.h b/src/turbomind/models/llama/BlockTrie.h index 953735241e..4cd272dd7f 100644 --- a/src/turbomind/models/llama/BlockTrie.h +++ b/src/turbomind/models/llama/BlockTrie.h @@ -37,7 +37,7 @@ class BlockTrie { * * @note If no blocks are matched, all containers in the returned tuple will be empty. */ - std::tuple>> match(const Sequence& seq) const; + std::tuple>> match(const Sequence& seq) const; /** * @brief Cache the key-value (KV) blocks of a given sequence. @@ -53,7 +53,7 @@ class BlockTrie { * - UniqueIds: A list of unique IDs of the cached blocks. 
* - std::vector>: A list of cached node */ - std::tuple>> cache(const Sequence& seq, + std::tuple>> cache(const Sequence& seq, const std::vector& tokens); /** @brief remove nodes[valid_size:] in a visited path from the trie tree diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc index 4feef6bcd6..17df0ed3e5 100644 --- a/src/turbomind/models/llama/SequenceManager.cc +++ b/src/turbomind/models/llama/SequenceManager.cc @@ -10,8 +10,8 @@ #include #include #include -#include #include +#include namespace turbomind { @@ -105,7 +105,7 @@ bool SequenceManager::Erase(uint64_t id) } return false; } -//clang-format off + void SequenceManager::CachePrompt(const Sequences& sequences, int active_size) { if (!block_trie_) { @@ -117,14 +117,17 @@ void SequenceManager::CachePrompt(const Sequences& sequences, int active_size) // seq prefill finished. We don't cache the prompt any longer continue; } - BlockIds block_ids; - UniqueIds block_unique_ids; + BlockIds block_ids; + UniqueIds block_unique_ids; std::vector> nodes; std::tie(block_ids, block_unique_ids, nodes) = block_trie_->cache(seq, seq.prompt); - int valid = block_manager_->Verify(block_ids, block_unique_ids); + int valid = block_manager_->Verify(block_ids, block_unique_ids); if (rank_ == 0) { TM_LOG_INFO("[CachePrompt] session %llu, cached block_ids %s, cached block_unique_ids %s, valid %d", - seq.id, serialize_vector(block_ids).c_str(), serialize_vector(block_unique_ids).c_str(), valid); + seq.id, + serialize_vector(block_ids).c_str(), + serialize_vector(block_unique_ids).c_str(), + valid); } // remove invalid nodes from trie tree if there is any if (valid < block_ids.size()) { @@ -138,21 +141,24 @@ void SequenceManager::CacheGeneration(const Sequence& seq) if (!block_trie_) { return; } - BlockIds block_ids; - UniqueIds block_unique_ids; + BlockIds block_ids; + UniqueIds block_unique_ids; std::vector> nodes; std::tie(block_ids, block_unique_ids, nodes) = block_trie_->cache(seq, seq.tokens); - int valid = block_manager_->Verify(block_ids, block_unique_ids); + int valid = block_manager_->Verify(block_ids, block_unique_ids); if (rank_ == 0) { TM_LOG_INFO("[CacheGeneration] session %llu, cached block_ids %s, cached block_unique_ids %s, valid %d", - seq.id, serialize_vector(block_ids).c_str(), serialize_vector(block_unique_ids).c_str(), valid); + seq.id, + serialize_vector(block_ids).c_str(), + serialize_vector(block_unique_ids).c_str(), + valid); } // remove invalid nodes from trie tree if there is any if (valid < block_ids.size()) { block_trie_->Remove(nodes, valid); } } -// clang-format on + void SequenceManager::VerifyAndLockCached(const Sequences& sequences) { BlockIds blocks; @@ -407,7 +413,6 @@ void SequenceManager::AssignAndActivate(const Sequences& sequences, // } } -//clang-format off void SequenceManager::PrefixMatch(Sequences& sequences) { if (!block_trie_) { @@ -415,10 +420,10 @@ void SequenceManager::PrefixMatch(Sequences& sequences) } for (int i = 0; i < sequences.size(); i++) { - BlockIds block_ids; - UniqueIds unique_ids; + BlockIds block_ids; + UniqueIds unique_ids; std::vector> matched_nodes; - auto& seq = const_cast(*sequences[i]); + auto& seq = const_cast(*sequences[i]); if (seq.cache_len != 0) { // We only apply prefix-cache matching when seq.cache_len is 0, @@ -427,10 +432,12 @@ void SequenceManager::PrefixMatch(Sequences& sequences) continue; } std::tie(block_ids, unique_ids, matched_nodes) = block_trie_->match(seq); - const int valid = block_manager_->Verify(block_ids, 
unique_ids); + const int valid = block_manager_->Verify(block_ids, unique_ids); if (rank_ == 0) { TM_LOG_INFO("[match] session %llu, matched block_ids %s, unique_ids %s", - seq.id, serialize_vector(block_ids).c_str(), serialize_vector(unique_ids).c_str()); + seq.id, + serialize_vector(block_ids).c_str(), + serialize_vector(unique_ids).c_str()); TM_LOG_INFO("[match] valid blocks %d, cache_len %d", valid, seq.cache_len); } // remove invalid nodes from trie tree if there is any @@ -446,7 +453,6 @@ void SequenceManager::PrefixMatch(Sequences& sequences) seq.cache_len = valid * block_seq_len_; } } -//clang-format on auto SequenceManager::Materialize(Sequences sequences, std::vector context_lengths, From acf40924b1a2f5bc751d18591e54d26ee76aef36 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Thu, 27 Mar 2025 16:47:38 +0800 Subject: [PATCH 06/34] combine Get&Create --- src/turbomind/models/llama/BlockTrie.cc | 6 ++-- src/turbomind/models/llama/LlamaBatch.cc | 5 +-- src/turbomind/models/llama/SequenceManager.cc | 31 +++++++++++++------ 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/src/turbomind/models/llama/BlockTrie.cc b/src/turbomind/models/llama/BlockTrie.cc index 9708d0bcea..c39cdecab6 100644 --- a/src/turbomind/models/llama/BlockTrie.cc +++ b/src/turbomind/models/llama/BlockTrie.cc @@ -48,13 +48,13 @@ std::tuple>> BlockTri curr_node = it->second; num_matched += block_seq_len_; } - return std::tuple(matched_blocks, matched_unique_ids, matched_nodes); + return std::make_tuple(matched_blocks, matched_unique_ids, matched_nodes); } std::tuple>> BlockTrie::cache(const Sequence& seq, const std::vector& tokens) { - TM_LOG_INFO("[cache] session %llu, seq.blocks %d, tokens %d", seq.id, seq.blocks.size(), tokens.size()); + TM_LOG_INFO("[BlockTrie][cache] session %llu, seq.blocks %d, tokens %d", seq.id, seq.blocks.size(), tokens.size()); FT_CHECK(seq.status != Sequence::kCached); FT_CHECK(tokens.size() <= seq.blocks.size() * block_seq_len_); @@ -82,7 +82,7 @@ std::tuple>> BlockTri auto it = curr_node->children.find(hash_key); if (it != curr_node->children.end()) { if (curr_tokens != it->second->tokens) { - TM_LOG_WARNING("hash key cache hit, but tokens are not the same"); + TM_LOG_WARNING("[BlockTrie][cache] hash key cache hit, but tokens are not the same"); break; } curr_node = it->second; diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index a75605ce10..6b80740e99 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -192,7 +192,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vectorid); + TM_LOG_INFO("[ProcessInferRequests] Request for %ld received", (long)r->id); } if (r->ec) { @@ -207,7 +207,8 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vectorsession.start_flag ? sequence_manager_->Create(r->id) : sequence_manager_->Get(r->id); + // auto ptr = r->session.start_flag ? 
sequence_manager_->Create(r->id) : sequence_manager_->Get(r->id); + auto ptr = sequence_manager_->Get(r->id); if (!ptr) { signals.push_back([r] { UpdateState(*r, Request::kInvalid, 0); }); continue; diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc index 17df0ed3e5..7a62116c53 100644 --- a/src/turbomind/models/llama/SequenceManager.cc +++ b/src/turbomind/models/llama/SequenceManager.cc @@ -59,20 +59,33 @@ const Sequence* SequenceManager::Create(uint64_t id) auto it = sequences_.find(id); if (it != sequences_.end()) { if (rank_ == 0) { - TM_LOG_WARNING("[SequenceManager][Create] Removing conflicting ID %ld", (long)id); + TM_LOG_WARNING("[SequenceManager][Create] Removing conflicting ID %llu", id); } Erase(it); } it = sequences_.emplace_hint(it, id, std::move(sequence)); + if (rank_ == 0) { + TM_LOG_INFO("[SequenceManager][Create] ID %llu", id); + } return &it->second; } const Sequence* SequenceManager::Get(uint64_t id) { if (auto it = sequences_.find(id); it != sequences_.end()) { + if (rank_ == 0) { + TM_LOG_INFO("[SequenceManager][Get] ID %llu", id); + } + return &it->second; + } + else { + Sequence sequence{id}; + it = sequences_.emplace_hint(it, id, std::move(sequence)); + if (rank_ == 0) { + TM_LOG_INFO("[SequenceManager][Create] ID %llu", id); + } return &it->second; } - return nullptr; } bool SequenceManager::Contains(uint64_t id) @@ -433,13 +446,6 @@ void SequenceManager::PrefixMatch(Sequences& sequences) } std::tie(block_ids, unique_ids, matched_nodes) = block_trie_->match(seq); const int valid = block_manager_->Verify(block_ids, unique_ids); - if (rank_ == 0) { - TM_LOG_INFO("[match] session %llu, matched block_ids %s, unique_ids %s", - seq.id, - serialize_vector(block_ids).c_str(), - serialize_vector(unique_ids).c_str()); - TM_LOG_INFO("[match] valid blocks %d, cache_len %d", valid, seq.cache_len); - } // remove invalid nodes from trie tree if there is any if (valid < block_ids.size()) { block_trie_->Remove(matched_nodes, valid); @@ -451,6 +457,13 @@ void SequenceManager::PrefixMatch(Sequences& sequences) seq.blocks.insert(seq.blocks.end(), block_ids.begin(), block_ids.begin() + valid); seq.block_unique_ids.insert(seq.block_unique_ids.end(), unique_ids.begin(), unique_ids.begin() + valid); seq.cache_len = valid * block_seq_len_; + if (rank_ == 0) { + TM_LOG_INFO("[match] session %llu, matched block_ids %s, unique_ids %s", + seq.id, + serialize_vector(block_ids).c_str(), + serialize_vector(unique_ids).c_str()); + TM_LOG_INFO("[match] valid blocks %d, cache_len %d", valid, seq.cache_len); + } } } From a2352d155ce041d9b7946b836cee8b6491176911 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Thu, 27 Mar 2025 20:39:40 +0800 Subject: [PATCH 07/34] update --- src/turbomind/models/llama/BlockTrie.cc | 23 +++++++- src/turbomind/models/llama/BlockTrie.h | 27 ++++++--- src/turbomind/models/llama/LlamaBatch.cc | 19 +++---- src/turbomind/models/llama/LlamaBatch.h | 2 +- src/turbomind/models/llama/SequenceManager.cc | 57 ++++++++++++------- 5 files changed, 85 insertions(+), 43 deletions(-) diff --git a/src/turbomind/models/llama/BlockTrie.cc b/src/turbomind/models/llama/BlockTrie.cc index c39cdecab6..c620cbdd1e 100644 --- a/src/turbomind/models/llama/BlockTrie.cc +++ b/src/turbomind/models/llama/BlockTrie.cc @@ -19,7 +19,7 @@ BlockTrie::BlockTrie(size_t block_len): block_seq_len_(block_len) root_ = std::make_shared(); } -std::tuple>> BlockTrie::match(const Sequence& seq) const +std::tuple>> BlockTrie::Match(const Sequence& seq) const { 
BlockIds matched_blocks; UniqueIds matched_unique_ids; @@ -51,7 +51,7 @@ std::tuple>> BlockTri return std::make_tuple(matched_blocks, matched_unique_ids, matched_nodes); } -std::tuple>> BlockTrie::cache(const Sequence& seq, +std::tuple>> BlockTrie::Cache(const Sequence& seq, const std::vector& tokens) { TM_LOG_INFO("[BlockTrie][cache] session %llu, seq.blocks %d, tokens %d", seq.id, seq.blocks.size(), tokens.size()); @@ -123,4 +123,23 @@ void BlockTrie::Remove(const std::vector>& nodes, int } } +void BlockTrie::Prune(ValidBlockChecker checker) +{ + return DFSPrune(root_, checker); +} + +void BlockTrie::DFSPrune(std::shared_ptr& node, ValidBlockChecker checker) +{ + for (auto it = node->children.begin(); it != node->children.end();) { + if (!checker(it->second->block_id, it->second->block_unique_id)) { + // child invalid + it = node->children.erase(it); + } + else { + DFSPrune(it->second, checker); + it++; + } + } +} + } // namespace turbomind diff --git a/src/turbomind/models/llama/BlockTrie.h b/src/turbomind/models/llama/BlockTrie.h index 4cd272dd7f..931f0f4812 100644 --- a/src/turbomind/models/llama/BlockTrie.h +++ b/src/turbomind/models/llama/BlockTrie.h @@ -23,7 +23,8 @@ class BlockTrie { public: explicit BlockTrie(size_t block_len); - /** @brief Attempt to match cached key-value (KV) blocks for a given sequence. + /** + * @brief Attempt to match cached key-value (KV) blocks for a given sequence. * * This function iterates the tokens of the sequence and attempts * to match them with the cached KV blocks. If the max prefix match is found, @@ -37,7 +38,7 @@ class BlockTrie { * * @note If no blocks are matched, all containers in the returned tuple will be empty. */ - std::tuple>> match(const Sequence& seq) const; + std::tuple>> Match(const Sequence& seq) const; /** * @brief Cache the key-value (KV) blocks of a given sequence. @@ -53,17 +54,27 @@ class BlockTrie { * - UniqueIds: A list of unique IDs of the cached blocks. 
* - std::vector>: A list of cached node */ - std::tuple>> cache(const Sequence& seq, + std::tuple>> Cache(const Sequence& seq, const std::vector& tokens); - /** @brief remove nodes[valid_size:] in a visited path from the trie tree + /** + * @brief remove nodes[valid_size:] in a visited path from the trie tree - * @param nodes a visited path returned by `match` or `cache` - * @param valid_size the valid number of cached blocks from the beginning of the path - * @note the visited path must be the returned value from `match` or `cache` - */ + * @param nodes a visited path returned by `match` or `cache` + * @param valid_size the valid number of cached blocks from the beginning of the path + * @note the visited path must be the returned value from `match` or `cache` + */ void Remove(const std::vector>& nodes, int valid_size); + /** + * @brief prune invalid nodes from the tree + */ + using ValidBlockChecker = std::function; + void Prune(ValidBlockChecker checker); + +private: + void DFSPrune(std::shared_ptr& node, ValidBlockChecker checker); + private: size_t block_seq_len_; diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 6b80740e99..9b32061867 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -1550,14 +1550,13 @@ void LlamaBatch::Finish(GenerationState& g, std::vector& signals) } template -auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Signal +auto LlamaBatch::Interrupt(int index, bool force_stop) -> Signal { if (rank_ == 0) { - TM_LOG_INFO("[Interrupt] slot %d, request %lu, stop %d, end %d", + TM_LOG_INFO("[Interrupt] slot %d, request %lu, stop %d", index, (long)state_->requests[index]->id, - force_stop, - force_end); + force_stop); } if (debug_ && rank_ == 0) { @@ -1571,11 +1570,11 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig TM_LOG_INFO("[Interrupt] slot %d, tokens [%s]", index, ss.str().c_str()); } - if (/*state_->requests[index]->session.end_flag ||*/ force_end) { - // Sequence is ending this round or a stop request is issued to end it - FT_CHECK(sequence_manager_->Erase(state_->requests[index]->id)); - } - else { + // if (state_->requests[index]->session.end_flag || force_end) { + // // Sequence is ending this round or a stop request is issued to end it + // FT_CHECK(sequence_manager_->Erase(state_->requests[index]->id)); + // } + // else { const int output_len = state_->h_context_length[index]; auto& seq = *state_->sequences[index]; @@ -1595,7 +1594,7 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig // Set unlock flag for corresponding blocks, will be unlocked in the next `Materialize()` sequence_manager_->UpdateAndSetUnlock(seq); - } + // } state_->sequences[index] = nullptr; diff --git a/src/turbomind/models/llama/LlamaBatch.h b/src/turbomind/models/llama/LlamaBatch.h index a211f84acd..3a3421ad80 100644 --- a/src/turbomind/models/llama/LlamaBatch.h +++ b/src/turbomind/models/llama/LlamaBatch.h @@ -102,7 +102,7 @@ class LlamaBatch { void Finish(GenerationState& g, std::vector& signals); - [[nodiscard]] Signal Interrupt(int index, bool force_stop = false, bool force_end = false); + [[nodiscard]] Signal Interrupt(int index, bool force_stop = false); void ComputeAndOutputLogits(T* hidden_states, int first, int last); diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc index 7a62116c53..56352e7c69 100644 --- 
a/src/turbomind/models/llama/SequenceManager.cc +++ b/src/turbomind/models/llama/SequenceManager.cc @@ -59,22 +59,32 @@ const Sequence* SequenceManager::Create(uint64_t id) auto it = sequences_.find(id); if (it != sequences_.end()) { if (rank_ == 0) { - TM_LOG_WARNING("[SequenceManager][Create] Removing conflicting ID %llu", id); + TM_LOG_WARNING("[SeqMgr][Create] Removing conflicting ID %llu", id); } Erase(it); } it = sequences_.emplace_hint(it, id, std::move(sequence)); if (rank_ == 0) { - TM_LOG_INFO("[SequenceManager][Create] ID %llu", id); + TM_LOG_INFO("[SeqMgr][Create] ID %llu", id); } return &it->second; } const Sequence* SequenceManager::Get(uint64_t id) { + if (!block_trie_) { + // when prefix_caching is not enabled, check if the id exists. If so, remove the older one + auto it = sequences_.find(id); + if (it != sequences_.end()) { + if (rank_ == 0) { + TM_LOG_INFO("[SeqMgr][Get] Removing conflicting ID %llu", id); + } + Erase(it); + } + } if (auto it = sequences_.find(id); it != sequences_.end()) { if (rank_ == 0) { - TM_LOG_INFO("[SequenceManager][Get] ID %llu", id); + TM_LOG_INFO("[SeqMgr][Get] ID %llu", id); } return &it->second; } @@ -82,7 +92,7 @@ const Sequence* SequenceManager::Get(uint64_t id) Sequence sequence{id}; it = sequences_.emplace_hint(it, id, std::move(sequence)); if (rank_ == 0) { - TM_LOG_INFO("[SequenceManager][Create] ID %llu", id); + TM_LOG_INFO("[SeqMgr][Get] Create ID %llu", id); } return &it->second; } @@ -103,11 +113,14 @@ void SequenceManager::Erase(std::map::iterator& it) else { UpdateAndSetUnlock(seq); } - // if prefix cache enabled, blocks will be shared by sequences, cannot be freed immediately + + it = sequences_.erase(it); if (block_trie_) { - freed_.insert(freed_.end(), seq.blocks.begin(), seq.blocks.end()); + auto is_valid = [this](int block_id, uint64_t block_unique_id) -> bool { + return this->block_manager_->unique_id(block_id) == block_unique_id; + }; + block_trie_->Prune(is_valid); } - it = sequences_.erase(it); } bool SequenceManager::Erase(uint64_t id) @@ -133,14 +146,14 @@ void SequenceManager::CachePrompt(const Sequences& sequences, int active_size) BlockIds block_ids; UniqueIds block_unique_ids; std::vector> nodes; - std::tie(block_ids, block_unique_ids, nodes) = block_trie_->cache(seq, seq.prompt); + std::tie(block_ids, block_unique_ids, nodes) = block_trie_->Cache(seq, seq.prompt); int valid = block_manager_->Verify(block_ids, block_unique_ids); if (rank_ == 0) { - TM_LOG_INFO("[CachePrompt] session %llu, cached block_ids %s, cached block_unique_ids %s, valid %d", - seq.id, - serialize_vector(block_ids).c_str(), - serialize_vector(block_unique_ids).c_str(), - valid); + TM_LOG_INFO("[SeqMgr][CachePrompt] ID %llu, cached blocks %d, valid num %d", seq.id, block_ids.size(), valid); + TM_LOG_DEBUG("[SeqMgr][CachePrompt] ID %llu, cached block_ids %s, block_unique_ids %s", + seq.id, + serialize_vector(block_ids).c_str(), + serialize_vector(block_unique_ids).c_str()); } // remove invalid nodes from trie tree if there is any if (valid < block_ids.size()) { @@ -157,14 +170,14 @@ void SequenceManager::CacheGeneration(const Sequence& seq) BlockIds block_ids; UniqueIds block_unique_ids; std::vector> nodes; - std::tie(block_ids, block_unique_ids, nodes) = block_trie_->cache(seq, seq.tokens); + std::tie(block_ids, block_unique_ids, nodes) = block_trie_->Cache(seq, seq.tokens); int valid = block_manager_->Verify(block_ids, block_unique_ids); if (rank_ == 0) { - TM_LOG_INFO("[CacheGeneration] session %llu, cached block_ids %s, cached 
block_unique_ids %s, valid %d", - seq.id, - serialize_vector(block_ids).c_str(), - serialize_vector(block_unique_ids).c_str(), - valid); + TM_LOG_INFO("[SeqMgr][CacheGeneration] ID %llu, cached blocks %d, valid %d", seq.id, block_ids.size(), valid); + TM_LOG_DEBUG("[SeqMgr][CacheGeneration] ID %llu, cached block_ids %s, cached block_unique_ids %s", + seq.id, + serialize_vector(block_ids).c_str(), + serialize_vector(block_unique_ids).c_str()); } // remove invalid nodes from trie tree if there is any if (valid < block_ids.size()) { @@ -444,7 +457,7 @@ void SequenceManager::PrefixMatch(Sequences& sequences) // seq.cache_len is updated after every forward iter. Refer to `LlamaBatch::Forward` continue; } - std::tie(block_ids, unique_ids, matched_nodes) = block_trie_->match(seq); + std::tie(block_ids, unique_ids, matched_nodes) = block_trie_->Match(seq); const int valid = block_manager_->Verify(block_ids, unique_ids); // remove invalid nodes from trie tree if there is any if (valid < block_ids.size()) { @@ -458,11 +471,11 @@ void SequenceManager::PrefixMatch(Sequences& sequences) seq.block_unique_ids.insert(seq.block_unique_ids.end(), unique_ids.begin(), unique_ids.begin() + valid); seq.cache_len = valid * block_seq_len_; if (rank_ == 0) { - TM_LOG_INFO("[match] session %llu, matched block_ids %s, unique_ids %s", + TM_LOG_INFO("[SeqMgr][match] ID %llu, hit blocks %d, cache_len %d", seq.id, valid, seq.cache_len); + TM_LOG_DEBUG("[SeqMgr][match] ID %llu, hit block_ids %s, unique_ids %s", seq.id, serialize_vector(block_ids).c_str(), serialize_vector(unique_ids).c_str()); - TM_LOG_INFO("[match] valid blocks %d, cache_len %d", valid, seq.cache_len); } } } From 1e940df911c79fa99a5455a66d238b22b78b5fc3 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Thu, 27 Mar 2025 21:20:02 +0800 Subject: [PATCH 08/34] clear blocks --- src/turbomind/models/llama/SequenceManager.cc | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc index 56352e7c69..e3719fa7bd 100644 --- a/src/turbomind/models/llama/SequenceManager.cc +++ b/src/turbomind/models/llama/SequenceManager.cc @@ -82,6 +82,10 @@ const Sequence* SequenceManager::Get(uint64_t id) Erase(it); } } + else { + // TODO: multi-round chat, same id. when prefix_caching is ON, the second round can hit + // the kv cache block occupied in the first + } if (auto it = sequences_.find(id); it != sequences_.end()) { if (rank_ == 0) { TM_LOG_INFO("[SeqMgr][Get] ID %llu", id); @@ -141,6 +145,7 @@ void SequenceManager::CachePrompt(const Sequences& sequences, int active_size) auto& seq = *sequences[i]; if (seq.cache_len > seq.prompt.size()) { // seq prefill finished. 
We don't cache the prompt any longer + seq.prompt.clear(); continue; } BlockIds block_ids; @@ -150,7 +155,7 @@ void SequenceManager::CachePrompt(const Sequences& sequences, int active_size) int valid = block_manager_->Verify(block_ids, block_unique_ids); if (rank_ == 0) { TM_LOG_INFO("[SeqMgr][CachePrompt] ID %llu, cached blocks %d, valid num %d", seq.id, block_ids.size(), valid); - TM_LOG_DEBUG("[SeqMgr][CachePrompt] ID %llu, cached block_ids %s, block_unique_ids %s", + TM_LOG_INFO("[SeqMgr][CachePrompt] ID %llu, cached block_ids %s, block_unique_ids %s", seq.id, serialize_vector(block_ids).c_str(), serialize_vector(block_unique_ids).c_str()); @@ -174,7 +179,7 @@ void SequenceManager::CacheGeneration(const Sequence& seq) int valid = block_manager_->Verify(block_ids, block_unique_ids); if (rank_ == 0) { TM_LOG_INFO("[SeqMgr][CacheGeneration] ID %llu, cached blocks %d, valid %d", seq.id, block_ids.size(), valid); - TM_LOG_DEBUG("[SeqMgr][CacheGeneration] ID %llu, cached block_ids %s, cached block_unique_ids %s", + TM_LOG_INFO("[SeqMgr][CacheGeneration] ID %llu, cached block_ids %s, cached block_unique_ids %s", seq.id, serialize_vector(block_ids).c_str(), serialize_vector(block_unique_ids).c_str()); @@ -450,7 +455,6 @@ void SequenceManager::PrefixMatch(Sequences& sequences) UniqueIds unique_ids; std::vector> matched_nodes; auto& seq = const_cast(*sequences[i]); - if (seq.cache_len != 0) { // We only apply prefix-cache matching when seq.cache_len is 0, // which means this seq is a brand-new sequence. @@ -466,13 +470,18 @@ void SequenceManager::PrefixMatch(Sequences& sequences) BlockIds matched_blocks(block_ids.begin(), block_ids.begin() + valid); block_manager_->Lock(matched_blocks); // block_manager_->Touch(matched_blocks); - + if (!seq.blocks.empty()) { + // seq.cache_len = 0 but seq.blocks is not empty. It means this seq is a reused seq + // the new seq's ID is reused. 
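// A minimal sketch of the state this branch re-establishes, assuming `block_seq_len_`
// tokens per KV block and `valid` verified matched blocks (the numbers are illustrative only,
// not taken from the patch):
//   seq.blocks           -> the `valid` matched block ids
//   seq.block_unique_ids -> the corresponding block unique ids
//   seq.cache_len        -> valid * block_seq_len_   (e.g. 2 * 64 = 128 reusable prompt tokens)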
+ seq.blocks.clear(); + seq.block_unique_ids.clear(); + } seq.blocks.insert(seq.blocks.end(), block_ids.begin(), block_ids.begin() + valid); seq.block_unique_ids.insert(seq.block_unique_ids.end(), unique_ids.begin(), unique_ids.begin() + valid); seq.cache_len = valid * block_seq_len_; if (rank_ == 0) { TM_LOG_INFO("[SeqMgr][match] ID %llu, hit blocks %d, cache_len %d", seq.id, valid, seq.cache_len); - TM_LOG_DEBUG("[SeqMgr][match] ID %llu, hit block_ids %s, unique_ids %s", + TM_LOG_INFO("[SeqMgr][match] ID %llu, hit block_ids %s, unique_ids %s", seq.id, serialize_vector(block_ids).c_str(), serialize_vector(unique_ids).c_str()); From 533941d4ea38d20dbccc44d3907ff451854985ba Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Fri, 28 Mar 2025 14:07:32 +0800 Subject: [PATCH 09/34] INFO log to DEBUG log --- src/turbomind/models/llama/SequenceManager.cc | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc index e3719fa7bd..e8c3e244aa 100644 --- a/src/turbomind/models/llama/SequenceManager.cc +++ b/src/turbomind/models/llama/SequenceManager.cc @@ -155,7 +155,7 @@ void SequenceManager::CachePrompt(const Sequences& sequences, int active_size) int valid = block_manager_->Verify(block_ids, block_unique_ids); if (rank_ == 0) { TM_LOG_INFO("[SeqMgr][CachePrompt] ID %llu, cached blocks %d, valid num %d", seq.id, block_ids.size(), valid); - TM_LOG_INFO("[SeqMgr][CachePrompt] ID %llu, cached block_ids %s, block_unique_ids %s", + TM_LOG_DEBUG("[SeqMgr][CachePrompt] ID %llu, cached block_ids %s, block_unique_ids %s", seq.id, serialize_vector(block_ids).c_str(), serialize_vector(block_unique_ids).c_str()); @@ -179,7 +179,7 @@ void SequenceManager::CacheGeneration(const Sequence& seq) int valid = block_manager_->Verify(block_ids, block_unique_ids); if (rank_ == 0) { TM_LOG_INFO("[SeqMgr][CacheGeneration] ID %llu, cached blocks %d, valid %d", seq.id, block_ids.size(), valid); - TM_LOG_INFO("[SeqMgr][CacheGeneration] ID %llu, cached block_ids %s, cached block_unique_ids %s", + TM_LOG_DEBUG("[SeqMgr][CacheGeneration] ID %llu, cached block_ids %s, cached block_unique_ids %s", seq.id, serialize_vector(block_ids).c_str(), serialize_vector(block_unique_ids).c_str()); @@ -315,8 +315,6 @@ struct Transaction { const Sequences& sequences_; Schedule& schedule_; - std::shared_ptr block_trie_; - explicit Transaction(const Sequences& sequences, int index, int block_count, int input_count, Schedule& sched): sequences_(sequences), schedule_(sched), index_(index), block_count_(block_count), input_count_(input_count) { @@ -471,8 +469,7 @@ void SequenceManager::PrefixMatch(Sequences& sequences) block_manager_->Lock(matched_blocks); // block_manager_->Touch(matched_blocks); if (!seq.blocks.empty()) { - // seq.cache_len = 0 but seq.blocks is not empty. It means this seq is a reused seq - // the new seq's ID is reused. + // seq.cache_len == 0 but seq.blocks is not empty. 
It means the new seq reuses an older seq's ID seq.blocks.clear(); seq.block_unique_ids.clear(); } @@ -481,7 +478,7 @@ void SequenceManager::PrefixMatch(Sequences& sequences) seq.cache_len = valid * block_seq_len_; if (rank_ == 0) { TM_LOG_INFO("[SeqMgr][match] ID %llu, hit blocks %d, cache_len %d", seq.id, valid, seq.cache_len); - TM_LOG_INFO("[SeqMgr][match] ID %llu, hit block_ids %s, unique_ids %s", + TM_LOG_DEBUG("[SeqMgr][match] ID %llu, hit block_ids %s, unique_ids %s", seq.id, serialize_vector(block_ids).c_str(), serialize_vector(unique_ids).c_str()); From 91d1412e0b4567e80253028dc777811a2638d590 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Fri, 28 Mar 2025 15:11:17 +0800 Subject: [PATCH 10/34] refactor chat.py --- lmdeploy/turbomind/chat.py | 61 +++++++----------------- src/turbomind/models/llama/LlamaBatch.cc | 12 ++--- 2 files changed, 23 insertions(+), 50 deletions(-) diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py index dd4c1fe3a0..aa2e38402f 100644 --- a/lmdeploy/turbomind/chat.py +++ b/lmdeploy/turbomind/chat.py @@ -9,13 +9,13 @@ from lmdeploy.model import ChatTemplateConfig from lmdeploy.serve.async_engine import get_names_from_model from lmdeploy.tokenizer import DetokenizeState -from lmdeploy.utils import _get_and_verify_max_len, _stop_words, get_hf_gen_cfg +from lmdeploy.utils import _get_and_verify_max_len, get_hf_gen_cfg, get_logger log_level = 'ERROR' +logger = get_logger('lmdeploy') + if os.getenv('TM_LOG_LEVEL') is None: os.environ['TM_LOG_LEVEL'] = log_level - from lmdeploy.utils import get_logger - logger = get_logger('lmdeploy') logger.setLevel(log_level) @@ -30,16 +30,12 @@ def input_prompt(model_name): return '\n'.join(iter(input, sentinel)) -async def async_infer(generator, session_id, input_ids, gen_config, sequence_start, step, stream_output, tokenizer, - state): +async def async_infer(generator, session_id, input_ids, gen_config, stream_output, tokenizer, state): token_ids = input_ids.copy() prev_len = 0 async for output in generator.async_stream_infer(session_id=session_id, input_ids=input_ids, gen_config=gen_config, - sequence_start=sequence_start, - sequence_end=False, - step=step, stream_output=stream_output): tokens = output.num_token if tokens > prev_len: @@ -64,7 +60,7 @@ def main(model_path: str, cache_max_entry_count: float = 0.8, cache_block_seq_len: int = 64, rope_scaling_factor: float = 0.0, - enable_prefix_caching: bool = False, + enable_prefix_caching: bool = True, session_len: int = None, stream_output: bool = True, request_output_len: int = 1024, @@ -116,7 +112,7 @@ def main(model_path: str, if chat_template_config.capability is None: chat_template_config.capability = cap print('chat_template_config:\n', chat_template_config, sep='', flush=True) - model = chat_template_config.chat_template + chat_template = chat_template_config.chat_template _, model_config = get_model_arch(model_path) session_len = _get_and_verify_max_len(model_config, session_len) @@ -145,59 +141,36 @@ def main(model_path: str, top_p=top_p, temperature=temperature, repetition_penalty=repetition_penalty) - stop_words = _stop_words(model.stop_words, tokenizer) - gen_config.convert_stop_bad_words_to_ids(tokenizer) - if stop_words is not None: - stop_words = stop_words[0][0].tolist() - if gen_config.stop_token_ids is None: - gen_config.stop_token_ids = stop_words + hf_gen_cfg = get_hf_gen_cfg(model_path) gen_config.update_from_hf_gen_cfg(hf_gen_cfg, tokenizer.eos_token_id) loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) - nth_round = 1 - step = 
0 seed = random.getrandbits(64) + messages = [] while True: - prompt = input_prompt(chat_template_name) - if prompt == 'exit': + user_input = input_prompt(chat_template_name) + if user_input == 'exit': exit(0) - elif prompt == 'end': + elif user_input == 'end': loop.run_until_complete(generator.async_end(session_id)) - nth_round = 1 - step = 0 seed = random.getrandbits(64) + messages = [] else: - prompt = model.get_prompt(prompt, nth_round == 1) - input_ids = tokenizer.encode(prompt, nth_round == 1) + messages.append(dict(role='user', content=user_input)) + prompt = chat_template.messages2prompt(messages) + input_ids = tokenizer.encode(prompt) gen_config.random_seed = seed - if model.capability == 'chat': - sequence_start = (nth_round == 1) - else: - sequence_start = True - step = 0 - - if step + len(input_ids) + request_output_len >= tm_model.session_len: - print('WARNING: exceed session max length.' - ' Please end the session.') - continue - - print(f'{prompt}', end='', flush=True) state = DetokenizeState(len(input_ids)) - coro = async_infer(generator, session_id, input_ids, gen_config, sequence_start, step, stream_output, - tokenizer, state) - tokens = loop.run_until_complete(coro) + coro = async_infer(generator, session_id, input_ids, gen_config, stream_output, tokenizer, state) + loop.run_until_complete(coro) - # update step - step += len(input_ids) + tokens print() - nth_round += 1 - if __name__ == '__main__': import fire diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 9b32061867..688ac0f8d5 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -240,12 +240,12 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vectorinputs.getPtr("input_ids"); From ce089741995cc197fd69e7066111f892e8860319 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Fri, 28 Mar 2025 20:02:34 +0800 Subject: [PATCH 11/34] unlock the unmatched blocks when id is reused --- src/turbomind/models/llama/LlamaBatch.cc | 8 -- src/turbomind/models/llama/SequenceManager.cc | 99 +++++++++++++------ 2 files changed, 67 insertions(+), 40 deletions(-) diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 688ac0f8d5..92cff9983c 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -207,7 +207,6 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vectorsession.start_flag ? sequence_manager_->Create(r->id) : sequence_manager_->Get(r->id); auto ptr = sequence_manager_->Get(r->id); if (!ptr) { signals.push_back([r] { UpdateState(*r, Request::kInvalid, 0); }); @@ -240,13 +239,6 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vectorinputs.getPtr("input_ids"); { diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc index e8c3e244aa..b99c20d445 100644 --- a/src/turbomind/models/llama/SequenceManager.cc +++ b/src/turbomind/models/llama/SequenceManager.cc @@ -16,7 +16,7 @@ namespace turbomind { template -std::string serialize_vector(const std::vector& data) +std::string vector2string(const std::vector& data) { if (data.empty()) { return "nil"; @@ -72,9 +72,9 @@ const Sequence* SequenceManager::Create(uint64_t id) const Sequence* SequenceManager::Get(uint64_t id) { + auto it = sequences_.find(id); if (!block_trie_) { // when prefix_caching is not enabled, check if the id exists. 
If so, remove the older one - auto it = sequences_.find(id); if (it != sequences_.end()) { if (rank_ == 0) { TM_LOG_INFO("[SeqMgr][Get] Removing conflicting ID %llu", id); @@ -83,23 +83,27 @@ const Sequence* SequenceManager::Get(uint64_t id) } } else { - // TODO: multi-round chat, same id. when prefix_caching is ON, the second round can hit - // the kv cache block occupied in the first - } - if (auto it = sequences_.find(id); it != sequences_.end()) { - if (rank_ == 0) { - TM_LOG_INFO("[SeqMgr][Get] ID %llu", id); + if (it != sequences_.end()) { + if (rank_ == 0) { + TM_LOG_INFO("[SeqMgr][Get] Reuse ID %llu, reset the mutable variables of the sequence", id); + } + auto &seq = it->second; + seq.prompt.clear(); + seq.tokens.clear(); + seq.cache_len = 0; + seq.random_state.clear(); + seq.rope_theta = 0.f; + seq.input_embeddings.clear(); + seq.input_embedding_ranges.clear(); + return &it->second; } - return &it->second; } - else { - Sequence sequence{id}; - it = sequences_.emplace_hint(it, id, std::move(sequence)); - if (rank_ == 0) { - TM_LOG_INFO("[SeqMgr][Get] Create ID %llu", id); - } - return &it->second; + Sequence sequence{id}; + it = sequences_.emplace_hint(it, id, std::move(sequence)); + if (rank_ == 0) { + TM_LOG_INFO("[SeqMgr][Get] Create ID %llu", id); } + return &it->second; } bool SequenceManager::Contains(uint64_t id) @@ -154,11 +158,15 @@ void SequenceManager::CachePrompt(const Sequences& sequences, int active_size) std::tie(block_ids, block_unique_ids, nodes) = block_trie_->Cache(seq, seq.prompt); int valid = block_manager_->Verify(block_ids, block_unique_ids); if (rank_ == 0) { - TM_LOG_INFO("[SeqMgr][CachePrompt] ID %llu, cached blocks %d, valid num %d", seq.id, block_ids.size(), valid); - TM_LOG_DEBUG("[SeqMgr][CachePrompt] ID %llu, cached block_ids %s, block_unique_ids %s", + TM_LOG_INFO("[SeqMgr][CachePrompt] ID %llu, cached blocks %d, tokens %d, valid blocks %d", + seq.id, + block_ids.size(), + seq.prompt.size(), + valid); + TM_LOG_INFO("[SeqMgr][CachePrompt] ID %llu, cached block_ids %s, unique_ids %s", seq.id, - serialize_vector(block_ids).c_str(), - serialize_vector(block_unique_ids).c_str()); + vector2string(block_ids).c_str(), + vector2string(block_unique_ids).c_str()); } // remove invalid nodes from trie tree if there is any if (valid < block_ids.size()) { @@ -178,11 +186,15 @@ void SequenceManager::CacheGeneration(const Sequence& seq) std::tie(block_ids, block_unique_ids, nodes) = block_trie_->Cache(seq, seq.tokens); int valid = block_manager_->Verify(block_ids, block_unique_ids); if (rank_ == 0) { - TM_LOG_INFO("[SeqMgr][CacheGeneration] ID %llu, cached blocks %d, valid %d", seq.id, block_ids.size(), valid); - TM_LOG_DEBUG("[SeqMgr][CacheGeneration] ID %llu, cached block_ids %s, cached block_unique_ids %s", + TM_LOG_INFO("[SeqMgr][CacheGeneration] ID %llu, cached blocks %d, tokens %d, valid blocks %d", + seq.id, + block_ids.size(), + seq.tokens.size(), + valid); + TM_LOG_INFO("[SeqMgr][CacheGeneration] ID %llu, cached block_ids %s, unique_ids %s", seq.id, - serialize_vector(block_ids).c_str(), - serialize_vector(block_unique_ids).c_str()); + vector2string(block_ids).c_str(), + vector2string(block_unique_ids).c_str()); } // remove invalid nodes from trie tree if there is any if (valid < block_ids.size()) { @@ -465,23 +477,46 @@ void SequenceManager::PrefixMatch(Sequences& sequences) if (valid < block_ids.size()) { block_trie_->Remove(matched_nodes, valid); } - BlockIds matched_blocks(block_ids.begin(), block_ids.begin() + valid); - 
block_manager_->Lock(matched_blocks); - // block_manager_->Touch(matched_blocks); + + BlockIds matched_ids(block_ids.begin(), block_ids.begin() + valid); + block_manager_->Lock(matched_ids); + // block_manager_->Touch(matched_ids); + if (rank_ == 0) { + TM_LOG_INFO("[SeqMgr][match] ID %llu, hit blocks %d, cache_len %d", seq.id, valid, seq.cache_len); + TM_LOG_INFO("[SeqMgr][match] ID %llu, hit block_ids %s, unique_ids %s", + seq.id, + vector2string(block_ids).c_str(), + vector2string(unique_ids).c_str()); + } + if (!seq.blocks.empty()) { // seq.cache_len == 0 but seq.blocks is not empty. It means the new seq reuses an older seq's ID + // So we should UNLOCK the unmatched blocks and reset seq.blocks as matched_blockes + BlockIds unmatched_ids; + std::set_difference(seq.blocks.begin(), seq.blocks.end(), matched_ids.begin(), matched_ids.end(), + std::inserter(unmatched_ids, unmatched_ids.begin())); + block_manager_->Unlock(unmatched_ids); seq.blocks.clear(); seq.block_unique_ids.clear(); + if (rank_ == 0) { + TM_LOG_INFO("[SegMgr][match] ID %llu, unlock unmatched blocks %d", seq.id, unmatched_ids.size()); + TM_LOG_INFO("[SegMgr][match] ID %llu, unmatched block_ids %s", + seq.id, + vector2string(unmatched_ids).c_str()); + } } + seq.cache_len = valid * block_seq_len_; seq.blocks.insert(seq.blocks.end(), block_ids.begin(), block_ids.begin() + valid); seq.block_unique_ids.insert(seq.block_unique_ids.end(), unique_ids.begin(), unique_ids.begin() + valid); - seq.cache_len = valid * block_seq_len_; if (rank_ == 0) { - TM_LOG_INFO("[SeqMgr][match] ID %llu, hit blocks %d, cache_len %d", seq.id, valid, seq.cache_len); - TM_LOG_DEBUG("[SeqMgr][match] ID %llu, hit block_ids %s, unique_ids %s", + TM_LOG_INFO("[SeqMgr][match] ID %llu, after matching, blocks %d, cache_len %d", + seq.id, + seq.blocks.size(), + seq.cache_len); + TM_LOG_INFO("[SeqMgr][match] ID %llu, after matching, block_ids %s, unique_ids %s", seq.id, - serialize_vector(block_ids).c_str(), - serialize_vector(unique_ids).c_str()); + vector2string(seq.blocks).c_str(), + vector2string(seq.block_unique_ids).c_str()); } } } From 9c3ebc8373cf2b09204e5491883ce3b879874d9d Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Mon, 31 Mar 2025 14:56:28 +0800 Subject: [PATCH 12/34] remove start_flag and end_flag from tm csrc --- lmdeploy/turbomind/turbomind.py | 2 +- src/turbomind/engine/gateway.h | 22 ++++++++++---------- src/turbomind/engine/model_request.cc | 4 ++-- src/turbomind/engine/request.h | 4 ++-- src/turbomind/engine/request_queue.h | 10 ++++----- src/turbomind/models/llama/LlamaBatch.cc | 26 ++++++++++++------------ src/turbomind/python/bind.cpp | 15 +++----------- 7 files changed, 37 insertions(+), 46 deletions(-) diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 8d43923109..73920ae00e 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -575,7 +575,7 @@ async def async_stream_infer(self, input_embedding_ranges=input_embedding_ranges, gen_config=gen_config) - session = _tm.SessionParam(id=session_id, step=step, start=sequence_start, end=sequence_end) + session = _tm.SessionParam(id=session_id, step=step) inputs = _np_dict_to_tm_dict(inputs) diff --git a/src/turbomind/engine/gateway.h b/src/turbomind/engine/gateway.h index 8350822046..dee7783c5e 100644 --- a/src/turbomind/engine/gateway.h +++ b/src/turbomind/engine/gateway.h @@ -68,13 +68,13 @@ class Gateway { { int rank = -1; - if (!r->session.start_flag) { - // route to corresponding rank - rank = 
seqid2rank_.find(r->session.id); - } - else { + // if (!r->session.start_flag) { + // // route to corresponding rank + // rank = seqid2rank_.find(r->session.id); + // } + // else { rank = next_.fetch_add(1, std::memory_order_relaxed) % size_; - } + // } if (rank >= 0) { queues_[rank]->push({std::move(r)}); @@ -129,11 +129,11 @@ class Gateway { // Bind for stateful inference std::vector bind_ids; - for (const auto& r : infer_reqs) { - if (r->session.start_flag && !r->session.end_flag) { // started but not ended - bind_ids.push_back(r->session.id); - } - } + // for (const auto& r : infer_reqs) { + // if (r->session.start_flag && !r->session.end_flag) { // started but not ended + // bind_ids.push_back(r->session.id); + // } + // } if (!bind_ids.empty()) { seqid2rank_.bind(bind_ids, rank); } diff --git a/src/turbomind/engine/model_request.cc b/src/turbomind/engine/model_request.cc index 6ba355e896..dfb923deed 100644 --- a/src/turbomind/engine/model_request.cc +++ b/src/turbomind/engine/model_request.cc @@ -149,9 +149,9 @@ auto ModelRequest::Forward(InputParam param, std::function cb) -> Output auto state = std::make_shared(); - if (param.session.start_flag) { + // if (param.session.start_flag) { session_id_ = param.session.id; - } + // } r->id = param.session.id; r->session = param.session; diff --git a/src/turbomind/engine/request.h b/src/turbomind/engine/request.h index 28f2943b54..33d4e7bc99 100644 --- a/src/turbomind/engine/request.h +++ b/src/turbomind/engine/request.h @@ -82,8 +82,8 @@ struct SessionParam { int step; - bool start_flag; - bool end_flag; + // bool start_flag; + // bool end_flag; bool kill_flag; }; diff --git a/src/turbomind/engine/request_queue.h b/src/turbomind/engine/request_queue.h index 590578bf8a..a0740ab41e 100644 --- a/src/turbomind/engine/request_queue.h +++ b/src/turbomind/engine/request_queue.h @@ -47,16 +47,16 @@ class RequestQueue { auto it = queue_.begin(); int count{}; while (rs.size() < max_rs_size && count < max_count && it != queue_.end()) { - if (!(*it)->session.start_flag) { + // if (!(*it)->session.start_flag) { rs.push_back(std::move(*it)); ++count; auto tmp = it; ++it; queue_.erase(tmp); - } - else { - ++it; - } + // } + // else { + // ++it; + // } } return count; diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 4c6f5c8543..07a4880187 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -276,7 +276,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vectorsession.start_flag && !r->inputs.isExist("input_embedding_ranges")) { + if (input_length /*&& r->session.start_flag*/ && !r->inputs.isExist("input_embedding_ranges")) { // TODO: truncate prompt to enable prefix caching for VLM seq.prompt.resize(input_length); std::copy_n(input_ids, input_length, seq.prompt.data()); @@ -354,7 +354,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vectorsession.start_flag) { + // if (r->session.start_flag) { seq.rope_theta = model_->attn_param_.rope.base; if (model_->attn_param_.rope.type == RopeType::kDynamic) { auto scaling_factor = model_->attn_param_.rope.factor; @@ -372,18 +372,18 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vectorsession.start_flag) { + // if (r->session.start_flag) { // prepare to initialize random state for new sequence h_random_seed_[idx] = r->gen_cfg.random_seed; - } - else { - // Recover device states if not a new sequence - h_curand_state_[existing_idx.size()] = 
*(curandState_t*)seq.random_state.data(); - existing_idx.push_back(idx); - } + // } + // else { + // // Recover device states if not a new sequence + // h_curand_state_[existing_idx.size()] = *(curandState_t*)seq.random_state.data(); + // existing_idx.push_back(idx); + // } // increment pointer idx++; @@ -1278,7 +1278,7 @@ void LlamaBatch::OutputLogits(const float* logits, int first, int last, Gener for (int i = first; i < last; ++i) { - const int input_len = h_input_length_buf_[i]; // input lenght for this iter + const int input_len = h_input_length_buf_[i]; // input length for this iter const float* src_ptr = logits; logits += (is_all ? input_len : 1) * model_->vocab_size_padded_; @@ -1507,9 +1507,9 @@ void LlamaBatch::Finish(GenerationState& g, std::vector& signals) for (int i = 0; i < batch_size - g.partial; ++i) { if (state_->h_finished[i]) { ++g.finished_count; - if (!state_->requests[i]->session.end_flag) { + // if (!state_->requests[i]->session.end_flag) { need_sync = true; - } + // } } } if (need_sync) { diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 1dea57375b..dbd69fcaf7 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -302,25 +302,16 @@ struct ScopedGIL { PYBIND11_MODULE(_turbomind, m) { py::class_(m, "SessionParam") - .def(py::init([](uint64_t id, int step, bool start, bool end) { - if (!start && end) { - throw std::logic_error("unsupported arguments: start=false, end=true"); - } + .def(py::init([](uint64_t id, int step) { ft::SessionParam param{}; param.id = id; param.step = step; - param.start_flag = start; - param.end_flag = end; return param; }), "id"_a, - "step"_a, - "start"_a, - "end"_a) + "step"_a) .def_readwrite("id", &ft::SessionParam::id) - .def_readwrite("step", &ft::SessionParam::step) - .def_readwrite("start", &ft::SessionParam::start_flag) - .def_readwrite("end", &ft::SessionParam::end_flag); + .def_readwrite("step", &ft::SessionParam::step); py::class_(m, "GenerationConfig") .def(py::init()) From d41683a171da4fdfd0df0fcd2f1cdc3969573e5f Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 1 Apr 2025 15:05:38 +0800 Subject: [PATCH 13/34] update output_logits --- lmdeploy/serve/utils.py | 3 +- src/turbomind/engine/model_request.cc | 12 +-- src/turbomind/models/llama/LlamaBatch.cc | 77 +++++++++++-------- src/turbomind/models/llama/SequenceManager.cc | 7 +- src/turbomind/models/llama/SequenceManager.h | 8 +- 5 files changed, 63 insertions(+), 44 deletions(-) diff --git a/lmdeploy/serve/utils.py b/lmdeploy/serve/utils.py index 2e81ab8502..4b04761f80 100644 --- a/lmdeploy/serve/utils.py +++ b/lmdeploy/serve/utils.py @@ -70,7 +70,8 @@ async def _proc(i): step=steps[i] if steps else 0) as gen: async for outputs in gen: pass - logits[i] = outputs.logits[:input_len, :] + logits[i] = outputs.logits[:input_len - steps[i], :] + logger.info(f'logits[{i}].shape: {logits[i].shape}, input_len: {input_len}, step: {steps[i]}') session_ids = list(range(len(input_ids))) tasks = [_proc(i) for i in range(len(input_ids))] diff --git a/src/turbomind/engine/model_request.cc b/src/turbomind/engine/model_request.cc index dfb923deed..c1ded60c3b 100644 --- a/src/turbomind/engine/model_request.cc +++ b/src/turbomind/engine/model_request.cc @@ -111,9 +111,9 @@ auto ModelRequest::Forward(InputParam param, std::function cb) -> Output // Max possible length of a sequence, this depends on `history_len` which isn't available here, so `session_len` // is used instead const int max_seq_len = session_len_ + 1; - const int max_out_len = 
std::min(output_len, session_len_) + 1; - // This does not include histroy length in interactive mode - const int max_in_out_len = std::min(input_len + output_len, session_len_) + 1; + const int max_out_len = std::min(output_len, session_len_); + // This does not include history length in interactive mode + const int max_in_out_len = std::min(input_len + output_len, session_len_); for (auto& [k, v] : *param.tensors) { inputs_->emplace(k, v); @@ -123,13 +123,15 @@ auto ModelRequest::Forward(InputParam param, std::function cb) -> Output add(outputs_, "sequence_length", TYPE_INT32, MEMORY_CPU, 1); if (param.gen_cfg.output_logits) { - const int len = param.gen_cfg.output_logits == GenerationConfig::kAll ? max_in_out_len : max_out_len; + const int len = param.gen_cfg.output_logits == GenerationConfig::kAll ? max_in_out_len - param.session.step : max_out_len; add(outputs_, "logits", TYPE_FP32, MEMORY_CPU, len, vocab_size_); + TM_LOG_INFO("[ModelRequest][forward] ID %llu, output_logits len %d", param.session.id, len); } if (param.gen_cfg.output_last_hidden_state) { - const int len = param.gen_cfg.output_last_hidden_state == GenerationConfig::kAll ? max_in_out_len : max_out_len; + const int len = param.gen_cfg.output_last_hidden_state == GenerationConfig::kAll ? max_in_out_len - param.session.step : max_out_len; add(outputs_, "last_hidden_state", data_type_, MEMORY_CPU, len, hidden_dim_); + TM_LOG_INFO("[ModelRequest][forward] ID %llu, output_last_hidden_state len %d", param.session.id, len); } if (param.gen_cfg.output_logprobs) { diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 07a4880187..8a30356f9b 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -224,24 +224,24 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vectorsession.step; - if (s < 0) { - s = ptr->tokens.size(); - } - else if (s > ptr->tokens.size()) { - if (tp_rank_ == 0) { - TM_LOG_WARNING("[ProcessInferRequests] Skipping invalid step (%d) setting for ID %lu", s, ptr->id); - } - s = ptr->tokens.size(); - } - return s; - }(); + // const int step = [&] { + // int s = r->session.step; + // if (s < 0) { + // s = ptr->tokens.size(); + // } + // else if (s > ptr->tokens.size()) { + // if (tp_rank_ == 0) { + // TM_LOG_WARNING("[ProcessInferRequests] Skipping invalid step (%d) setting for ID %lu", s, ptr->id); + // } + // s = ptr->tokens.size(); + // } + // return s; + // }(); - if (step + input_length > session_len_) { - signals.push_back([r] { UpdateState(*r, Request::kTooLong, 0); }); - continue; - } + // if (step + input_length > session_len_) { + // signals.push_back([r] { UpdateState(*r, Request::kTooLong, 0); }); + // continue; + // } FT_CHECK(!state.requests[idx]); @@ -276,12 +276,16 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vectorsession.start_flag*/ && !r->inputs.isExist("input_embedding_ranges")) { - // TODO: truncate prompt to enable prefix caching for VLM + if (input_length && !r->inputs.isExist("input_embedding_ranges")) { seq.prompt.resize(input_length); std::copy_n(input_ids, input_length, seq.prompt.data()); + seq.prefix_match_end_index = input_length; + if (r->gen_cfg.output_logits || r->gen_cfg.output_last_hidden_state) { + // when output logits or output last hidden state, prefix match can only + // apply to prompts[0:step) + seq.prefix_match_end_index = r->session.step; + } } - // copy input embeddings if (r->inputs.isExist("input_embedding_ranges")) { const auto 
range_tensor = r->inputs.at("input_embedding_ranges"); @@ -1221,14 +1225,18 @@ void LlamaBatch::ComputeAndOutputLogits(T* hidden_states, int first, int last int token_num = 0; bool found = false; for (int i = first; i < last; ++i) { + const auto& s = *state_->sequences[i]; if (state_->requests[i]->gen_cfg.output_logits == GenerationConfig::kAll) { - const auto& s = *state_->sequences[i]; // Skip when the seq is filling missed cache only if (s.cache_len + h_input_length_buf_[i] > s.tokens.size()) { found = true; } } token_num += h_input_length_buf_[i]; + if (tp_rank_ == 0) { + TM_LOG_INFO("[compute_logits] ID %llu, cache_len %d, input_len %d, tokens %d, total tokens %d", + s.id, s.cache_len, h_input_length_buf_[i], s.tokens.size(), token_num); + } } if (!found) { @@ -1273,7 +1281,7 @@ void LlamaBatch::ComputeAndOutputLogits(T* hidden_states, int first, int last template void LlamaBatch::OutputLogits(const float* logits, int first, int last, GenerationConfig::OutType out_type) { - // when `is_all` is true, logits only contains last token of the sequences + // when `is_all` is false, logits only contains last token of the sequences const bool is_all = out_type == GenerationConfig::kAll; for (int i = first; i < last; ++i) { @@ -1298,16 +1306,16 @@ void LlamaBatch::OutputLogits(const float* logits, int first, int last, Gener int diff = (history_len + offset) - cache_len; - const int valid_len = input_len - std::max(0, (history_len + offset) - cache_len); + const int valid_len = input_len - std::max(0, diff); - // TM_LOG_ERROR("%d %d %d %d %d %d %d", - // history_len, - // offset, - // cache_len, - // input_len, - // valid_len, - // std::max(0, diff), - // std::max(0, -diff)); + TM_LOG_INFO("[output_logits] %d %d %d %d %d %d %d", + history_len, + offset, + cache_len, + input_len, + valid_len, + std::max(0, diff), + std::max(0, -diff)); if (valid_len <= 0) { continue; @@ -1315,10 +1323,10 @@ void LlamaBatch::OutputLogits(const float* logits, int first, int last, Gener if (is_all) { // Skip invalid tokens caused by cache miss - src_ptr += std::max(0, (history_len + offset) - cache_len) * model_->vocab_size_padded_; + src_ptr += std::max(0, diff) * model_->vocab_size_padded_; } - // Skip previous chunks - dst_ptr += std::max(0, cache_len - (history_len + offset)) * model_->vocab_size_; + // // Skip previous chunks + // dst_ptr += std::max(0, -diff) * model_->vocab_size_; check_cuda_error(cudaMemcpy2DAsync(dst_ptr, sizeof(float) * model_->vocab_size_, @@ -1328,6 +1336,7 @@ void LlamaBatch::OutputLogits(const float* logits, int first, int last, Gener valid_len, cudaMemcpyDefault, stream_)); + dst_ptr += valid_len * model_->vocab_size_; } } } diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc index d792616546..215a205d69 100644 --- a/src/turbomind/models/llama/SequenceManager.cc +++ b/src/turbomind/models/llama/SequenceManager.cc @@ -471,12 +471,17 @@ void SequenceManager::PrefixMatch(Sequences& sequences) // seq.cache_len is updated after every forward iter. 
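// A small worked example of the gating introduced in this patch, with illustrative numbers
// (assume block_seq_len_ = 64): a request arriving with session.step = 128 and output_logits
// (or output_last_hidden_state) enabled gets prefix_match_end_index = 128, so at most
// 128 / 64 = 2 blocks can be matched and cache_len never exceeds 128. Every prompt token from
// position `step` onward is therefore recomputed in the forward pass, which keeps its logits
// available when OutputLogits copies them out.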
Refer to `LlamaBatch::Forward` continue; } + if (seq.prefix_match_end_index < block_seq_len_) { + continue; + } std::tie(block_ids, unique_ids, matched_nodes) = block_trie_->Match(seq); - const int valid = block_manager_->Verify(block_ids, unique_ids); + + int valid = block_manager_->Verify(block_ids, unique_ids); // remove invalid nodes from trie tree if there is any if (valid < block_ids.size()) { block_trie_->Remove(matched_nodes, valid); } + valid = std::min(valid, seq.prefix_match_end_index / block_seq_len_); BlockIds matched_ids(block_ids.begin(), block_ids.begin() + valid); block_manager_->Lock(matched_ids); diff --git a/src/turbomind/models/llama/SequenceManager.h b/src/turbomind/models/llama/SequenceManager.h index 0ab3326138..f6b31031ff 100644 --- a/src/turbomind/models/llama/SequenceManager.h +++ b/src/turbomind/models/llama/SequenceManager.h @@ -23,14 +23,16 @@ struct Sequence { BlockIds blocks; UniqueIds block_unique_ids; - int input_length = 0; + int input_length = 0; // the number of tokens to be processed in each forward iter mutable std::vector prompt; - - mutable std::vector tokens; // update by user + mutable std::vector tokens; // update by user or when the sequence is finished mutable int cache_len = 0; + // since which token of a sequence that prefix match won't apply + mutable int prefix_match_end_index = 0; + // additional data kept round-to-round mutable std::vector random_state; // update by user From 70399b4399c8ebff6da784c32928f087e3183a99 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 1 Apr 2025 22:16:28 +0800 Subject: [PATCH 14/34] update --- src/turbomind/models/llama/LlamaBatch.cc | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 8a30356f9b..085cd60c4b 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -1233,10 +1233,6 @@ void LlamaBatch::ComputeAndOutputLogits(T* hidden_states, int first, int last } } token_num += h_input_length_buf_[i]; - if (tp_rank_ == 0) { - TM_LOG_INFO("[compute_logits] ID %llu, cache_len %d, input_len %d, tokens %d, total tokens %d", - s.id, s.cache_len, h_input_length_buf_[i], s.tokens.size(), token_num); - } } if (!found) { @@ -1296,7 +1292,7 @@ void LlamaBatch::OutputLogits(const float* logits, int first, int last, Gener auto dst_ptr = state_->requests[i]->outputs.getPtr("logits"); const int cache_len = state_->sequences[i]->cache_len; - const int history_len = state_->sequences[i]->tokens.size(); + const int history_len = state_->requests[i]->session.step; // ----------H------I-------P----------- // C C C C @@ -1325,8 +1321,8 @@ void LlamaBatch::OutputLogits(const float* logits, int first, int last, Gener // Skip invalid tokens caused by cache miss src_ptr += std::max(0, diff) * model_->vocab_size_padded_; } - // // Skip previous chunks - // dst_ptr += std::max(0, -diff) * model_->vocab_size_; + // Skip previous chunks + dst_ptr += std::max(0, -diff) * model_->vocab_size_; check_cuda_error(cudaMemcpy2DAsync(dst_ptr, sizeof(float) * model_->vocab_size_, @@ -1336,7 +1332,6 @@ void LlamaBatch::OutputLogits(const float* logits, int first, int last, Gener valid_len, cudaMemcpyDefault, stream_)); - dst_ptr += valid_len * model_->vocab_size_; } } } From 1b99728a4519737e4b9b5ff5b219ad36fa1fd14c Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Wed, 2 Apr 2025 18:41:04 +0800 Subject: [PATCH 15/34] update --- .github/workflows/unit-test.yml | 1 + generate.sh | 1 + 2 files 
changed, 2 insertions(+) diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index 5456a3d668..5a39caa859 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -72,6 +72,7 @@ jobs: -DUSE_NVTX=ON \ -DSM=80 \ -DCMAKE_CUDA_ARCHITECTURES=80 \ + -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \ -DBUILD_TEST=OFF make -j$(nproc) && make install - name: Install lmdeploy diff --git a/generate.sh b/generate.sh index 0c25b8cbf2..5e21d50885 100755 --- a/generate.sh +++ b/generate.sh @@ -14,4 +14,5 @@ cmake ${builder} .. \ -DBUILD_PY_FFI=ON \ -DBUILD_MULTI_GPU=ON \ -DCMAKE_CUDA_FLAGS="-lineinfo" \ + -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \ -DUSE_NVTX=ON From c5a29624e195cb5cba13d98aeed43c19bc3b58cb Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Thu, 3 Apr 2025 00:29:14 +0800 Subject: [PATCH 16/34] fix api_client --- lmdeploy/serve/openai/api_client.py | 155 +++++----------------------- 1 file changed, 24 insertions(+), 131 deletions(-) diff --git a/lmdeploy/serve/openai/api_client.py b/lmdeploy/serve/openai/api_client.py index 79fd045701..21ca040d2d 100644 --- a/lmdeploy/serve/openai/api_client.py +++ b/lmdeploy/serve/openai/api_client.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import json -from typing import Any, Dict, Iterable, List, Optional, Union +from typing import Any, Dict, List, Optional, Union import requests @@ -168,70 +168,6 @@ def chat_completions_v1(self, output = json_loads(decoded) yield output - def chat_interactive_v1(self, - prompt: Union[str, List[Dict[str, str]]], - image_url: Optional[Union[str, List[str]]] = None, - session_id: int = -1, - interactive_mode: bool = False, - stream: bool = False, - stop: Optional[Union[str, List[str]]] = None, - request_output_len: Optional[int] = None, - top_p: float = 0.8, - top_k: int = 40, - temperature: float = 0.8, - repetition_penalty: float = 1.0, - ignore_eos: bool = False, - skip_special_tokens: Optional[bool] = True, - adapter_name: Optional[str] = None, - **kwargs): - """Interactive completions. - - - On interactive mode, the chat history is kept on the server. Please - set `interactive_mode = True`. - - On normal mode, no chat history is kept on the server. Set - `interactive_mode = False`. - - Args: - prompt: the prompt to use for the generation. - image_url (str | List[str] | None): the image url or base64 encoded - string for VL models. - session_id: determine which instance will be called. - If not specified with a value other than -1, using random value - directly. - interactive_mode (bool): turn on interactive mode or not. On - interactive mode, session history is kept on the server (and - vice versa). - stream: whether to stream the results or not. - stop (str | List[str] | None): To stop generating further tokens. - Only accept stop words that's encoded to one token idex. - request_output_len (int): output token nums. If not specified, - will use maximum possible number for a session. - top_p (float): If set to float < 1, only the smallest set of most - probable tokens with probabilities that add up to top_p or - higher are kept for generation. - top_k (int): The number of the highest probability vocabulary - tokens to keep for top-k-filtering - temperature (float): to modulate the next token probability - repetition_penalty (float): The parameter for repetition penalty. - 1.0 means no penalty - ignore_eos (bool): indicator for ignoring eos - skip_special_tokens (bool): Whether or not to remove special tokens - in the decoding. Default to be True. 
- adapter_name (str): For slora inference. Choose which lora to do - the inference. - - Yields: - json objects consist of text, tokens, input_tokens, - history_tokens, finish_reason - """ - pload = {k: v for k, v in locals().copy().items() if k[:2] != '__' and k not in ['self']} - response = requests.post(self.chat_intractive_v1_url, headers=self.headers, json=pload, stream=stream) - for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b'\n'): - if chunk: - decoded = chunk.decode('utf-8') - output = json_loads(decoded) - yield output - def completions_v1( self, model: str, @@ -304,7 +240,7 @@ def completions_v1( yield output def chat(self, - prompt: str, + messages: List[str], session_id: int, image_url: Optional[Union[str, List[str]]] = None, request_output_len: int = 512, @@ -317,7 +253,7 @@ def chat(self, """Chat with a unique session_id. Args: - prompt: the prompt to use for the generation. + messages(List): the chat context, including history session_id: determine which instance will be called. If not specified with a value other than -1, using random value directly. @@ -340,35 +276,23 @@ def chat(self, text, tokens, finish_reason """ assert session_id != -1, 'please set a value other than -1' - for outputs in self.chat_interactive_v1(prompt, + for outputs in self.chat_completions_v1(model=self.available_models[0], + messages=messages, + temperature=temperature, + top_p=top_p, session_id=session_id, image_url=image_url, - request_output_len=request_output_len, - interactive_mode=True, - stream=stream, + max_tokens=request_output_len, + stream=True, top_k=top_k, - top_p=top_p, - temperature=temperature, repetition_penalty=repetition_penalty, ignore_eos=ignore_eos): - if outputs['finish_reason'] == 'length' and outputs['tokens'] == 0: + finish_reason = outputs['choices'][0]['finish_reason'] + content = outputs['choices'][0]['delta']['content'] + if finish_reason == 'length' and content == '': print('WARNING: exceed session max length.' ' Please end the session.') - yield outputs['text'], outputs['tokens'], outputs['finish_reason'] - - def end_session(self, session_id: int): - """End the session with a unique session_id. - - Args: - session_id: determine which instance will be called. - If not specified with a value other than -1, using random value - directly. 
- """ - for out in self.chat_interactive_v1(prompt='', - session_id=session_id, - request_output_len=0, - interactive_mode=False): - pass + yield content, finish_reason def input_prompt(): @@ -378,62 +302,31 @@ def input_prompt(): return '\n'.join(iter(input, sentinel)) -def get_streaming_response(prompt: str, - api_url: str, - session_id: int, - request_output_len: int = 512, - stream: bool = True, - interactive_mode: bool = False, - ignore_eos: bool = False, - cancel: bool = False, - top_p: float = 0.8, - temperature: float = 0.7, - api_key: Optional[str] = None) -> Iterable[List[str]]: - headers = {'User-Agent': 'Test Client'} - if api_key is not None: - headers['Authorization'] = f'Bearer {api_key}' - pload = { - 'prompt': prompt, - 'stream': stream, - 'session_id': session_id, - 'request_output_len': request_output_len, - 'interactive_mode': interactive_mode, - 'ignore_eos': ignore_eos, - 'cancel': cancel, - 'top_p': top_p, - 'temperature': temperature - } - response = requests.post(api_url, headers=headers, json=pload, stream=stream) - for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b'\n'): - if chunk: - data = json_loads(chunk.decode('utf-8')) - output = data.pop('text', '') - tokens = data.pop('tokens', 0) - finish_reason = data.pop('finish_reason', None) - yield output, tokens, finish_reason - - def main(api_server_url: str = 'http://0.0.0.0:23333', session_id: int = 0, api_key: Optional[str] = None): """Main function to chat in terminal.""" if not api_server_url.startswith('http://'): - print(f'[WARNING] api_server_url of the api_server should ' - f'start with "http://", but got "{api_server_url}"') + print(f'[WARNING] api_server_url should start with "http://", but got "{api_server_url}"') # noqa: E231 api_server_url = 'http://' + api_server_url.strip() api_client = APIClient(api_server_url, api_key=api_key) + messages = [] while True: prompt = input_prompt() if prompt in ['exit', 'end']: - api_client.end_session(session_id) + messages = [] if prompt == 'exit': exit(0) else: - for text, tokens, finish_reason in api_client.chat(prompt, - session_id=session_id, - request_output_len=512, - stream=True): + messages.append(dict(role='user', content=prompt)) + response = [] + for text, finish_reason in api_client.chat(messages, + session_id=session_id, + request_output_len=512, + stream=True): if finish_reason == 'length': continue print(text, end='') + response.append(text) + messages.append(dict(role='assistant', content=''.join(response))) if __name__ == '__main__': From 499b7095021197a23cec7d4ed2b44cf47a76d8a7 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Thu, 3 Apr 2025 09:53:51 +0800 Subject: [PATCH 17/34] remove interactive chat API --- .../restful/test_restful_chat_func.py | 470 ------------------ autotest/utils/run_restful_chat.py | 50 -- docs/en/llm/api_server.md | 22 - docs/zh_cn/llm/api_server.md | 22 - lmdeploy/serve/openai/api_server.py | 160 +----- 5 files changed, 11 insertions(+), 713 deletions(-) diff --git a/autotest/interface/restful/test_restful_chat_func.py b/autotest/interface/restful/test_restful_chat_func.py index b272439598..0aa5d7b771 100644 --- a/autotest/interface/restful/test_restful_chat_func.py +++ b/autotest/interface/restful/test_restful_chat_func.py @@ -1,11 +1,8 @@ -import random from concurrent.futures import ThreadPoolExecutor -from random import randint import pytest from tqdm import tqdm from utils.restful_return_check import (assert_chat_completions_batch_return, assert_chat_completions_stream_return, - 
assert_chat_interactive_batch_return, assert_chat_interactive_stream_return, get_repeat_times) from lmdeploy.serve.openai.api_client import APIClient, get_model_list @@ -66,13 +63,6 @@ def process_one(question): msg = [dict(role='user', content=question)] - data = api_client.chat_interactive_v1(msg, - session_id=randint(1, 100), - repetition_penalty=1.02, - request_output_len=224) - for item in data: - pass - data = api_client.chat_completions_v1(model=model_name, messages=msg, repetition_penalty=1.02, @@ -88,14 +78,6 @@ def process_one(question): for response in tqdm(executor.map(process_one, ['你是谁'] * 500)): continue - def test_issue1324_illegal_topk(self): - api_client = APIClient(BASE_URL) - for output in api_client.chat_interactive_v1(prompt='Hi, pls intro yourself', top_k=-1): - continue - assert output.get('code') == 400 - assert output.get('message') == 'The top_k `-1` cannot be a negative integer.' - assert output.get('object') == 'error' - @pytest.mark.order(8) @pytest.mark.turbomind @@ -614,455 +596,3 @@ def test_logprobs_streaming(self): length = api_client.encode(response, add_bos=False)[1] assert outputList[-1].get('choices')[0].get('finish_reason') == 'length' assert length == 5 or length == 6 - - -@pytest.mark.order(8) -@pytest.mark.turbomind -@pytest.mark.pytorch -@pytest.mark.flaky(reruns=2) -class TestRestfulInterfaceChatInteractive: - - def test_return_info_with_prompt(self): - api_client = APIClient(BASE_URL) - for output in api_client.chat_interactive_v1(prompt='Hi, pls intro yourself', temperature=0.01): - continue - assert_chat_interactive_batch_return(output) - - def test_return_info_with_messegae(self): - api_client = APIClient(BASE_URL) - for output in api_client.chat_interactive_v1(prompt=[{ - 'role': 'user', - 'content': 'Hi, pls intro yourself' - }], - temperature=0.01): - continue - assert_chat_interactive_batch_return(output) - - def test_return_info_with_prompt_streaming(self): - api_client = APIClient(BASE_URL) - outputList = [] - for output in api_client.chat_interactive_v1(prompt='Hi, pls intro yourself', stream=True, temperature=0.01): - outputList.append(output) - assert_chat_interactive_stream_return(outputList[-1], True, index=len(outputList) - 1) - for index in range(0, len(outputList) - 1): - assert_chat_interactive_stream_return(outputList[index], index=index) - - def test_return_info_with_messegae_streaming(self): - api_client = APIClient(BASE_URL) - outputList = [] - for output in api_client.chat_interactive_v1(prompt=[{ - 'role': 'user', - 'content': 'Hi, pls intro yourself' - }], - stream=True, - temperature=0.01): - outputList.append(output) - - assert_chat_interactive_stream_return(outputList[-1], True, index=len(outputList) - 1) - for index in range(0, len(outputList) - 1): - assert_chat_interactive_stream_return(outputList[index], index=index) - - def test_single_stopword(self): - api_client = APIClient(BASE_URL) - for output in api_client.chat_interactive_v1(prompt='Shanghai is', stop=' is', temperature=0.01): - continue - assert_chat_interactive_batch_return(output) - assert ' is' not in output.get('text') - assert output.get('finish_reason') == 'stop' - - def test_single_stopword_streaming(self): - api_client = APIClient(BASE_URL) - outputList = [] - for output in api_client.chat_interactive_v1(prompt='Shanghai is', stop=' is', stream=True, temperature=0.01): - outputList.append(output) - - assert_chat_interactive_stream_return(outputList[-1], True, index=len(outputList) - 2) - for index in range(0, len(outputList) - 1): - 
assert_chat_interactive_stream_return(outputList[index], index=index) - assert ' to' not in outputList[index].get('text') - assert output.get('finish_reason') == 'stop' - - def test_array_stopwords(self): - api_client = APIClient(BASE_URL) - for output in api_client.chat_interactive_v1(prompt='Shanghai is', stop=[' is', '上海', ' to'], temperature=0.01): - continue - assert_chat_interactive_batch_return(output) - assert ' is' not in output.get('text') - assert ' 上海' not in output.get('text') - assert ' to' not in output.get('text') - assert output.get('finish_reason') == 'stop' - - def test_array_stopwords_streaming(self): - api_client = APIClient(BASE_URL) - outputList = [] - for output in api_client.chat_interactive_v1(prompt='Shanghai is', - stop=[' is', '上海', ' to'], - stream=True, - temperature=0.01): - outputList.append(output) - - assert_chat_interactive_stream_return(outputList[-1], True, index=len(outputList) - 2) - for index in range(0, len(outputList) - 1): - assert_chat_interactive_stream_return(outputList[index], index=index) - assert ' is' not in outputList[index].get('text') - assert '上海' not in outputList[index].get('text') - assert ' to' not in outputList[index].get('text') - assert output.get('finish_reason') == 'stop' - - def test_special_words(self): - message = '<|im_start|>system\n当开启工具以及代码时,根据需求选择合适的工具进行调用\n' + \ - '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' + \ - '能够在一个有状态的 Jupyter 笔记本环境中运行 Python 代码。当你向 python ' + \ - '发送含有 Python >代码的消息时,它将在该环境中执行。这个工具适用于多种场景,' + \ - '如数据分析或处理(包括数据操作、统计分析、图表绘制),复杂的计算问题(解决数学和物理' + \ - '难题),编程示例(理解编程概念或特性),文本处理和分析(比如文本解析和自然语言处理),机器学习和数据科学(用于' + \ - '展示模型训练和数据可视化),以及文件操作和数据导入(处理CSV、JSON等格式的文件)。<|im_end|>\n' + \ - '<|im_start|>user\n设 $L$ 为圆周$x^2+y^2=2x$,计算曲线积分:$I=\\int_L' + \ - '{x\\mathrm{d}s}=$<|im_end|>\n<|im_start|>assistant' - api_client = APIClient(BASE_URL) - for output in api_client.chat_interactive_v1(prompt=message, skip_special_tokens=False, temperature=0.01): - continue - assert_chat_interactive_batch_return(output) - assert '<|action_start|><|interpreter|>' in output.get('text') - - for output in api_client.chat_interactive_v1(prompt=message, skip_special_tokens=True, temperature=0.01): - continue - assert_chat_interactive_batch_return(output) - assert '<|action_start|><|interpreter|>' not in output.get('text') - - def test_minimum_repetition_penalty(self): - api_client = APIClient(BASE_URL) - for output in api_client.chat_interactive_v1(prompt='Shanghai is', - repetition_penalty=0.1, - temperature=0.01, - request_output_len=512): - continue - assert_chat_interactive_batch_return(output) - assert get_repeat_times(output.get('text'), 'is a name') > 5 or get_repeat_times( - output.get('text'), 'Shanghai is') > 5 - - def test_minimum_repetition_penalty_streaming(self): - api_client = APIClient(BASE_URL) - outputList = [] - for output in api_client.chat_interactive_v1(prompt='Shanghai is', - repetition_penalty=0.1, - temperature=0.01, - stream=True, - request_output_len=512): - outputList.append(output) - - assert_chat_interactive_stream_return(outputList[-1], True, index=len(outputList) - 2) - response = '' - for index in range(0, len(outputList) - 1): - assert_chat_interactive_stream_return(outputList[index], index=index) - response += outputList[index].get('text') - assert get_repeat_times(response, 'is a name') > 5 or get_repeat_times(response, 'Shanghai is') > 5 - - def test_repetition_penalty_bigger_than_1(self): - api_client = APIClient(BASE_URL) - for output in 
api_client.chat_interactive_v1(prompt='Shanghai is', - repetition_penalty=1.2, - temperature=0.01, - request_output_len=512): - continue - assert_chat_interactive_batch_return(output) - - def test_repetition_penalty_bigger_than_1_streaming(self): - api_client = APIClient(BASE_URL) - outputList = [] - for output in api_client.chat_interactive_v1(prompt='Shanghai is', - repetition_penalty=1.2, - stream=True, - temperature=0.01, - request_output_len=512): - outputList.append(output) - assert_chat_interactive_stream_return(outputList[-1], True, index=len(outputList) - 2) - for index in range(0, len(outputList) - 1): - assert_chat_interactive_stream_return(outputList[index], index=index) - - def test_multiple_rounds(self): - api_client = APIClient(BASE_URL) - history = 0 - session_id = random.randint(0, 100000) - for i in range(3): - for output in api_client.chat_interactive_v1(prompt='Shanghai is', - temperature=0.01, - interactive_mode=True, - session_id=session_id): - continue - assert_chat_interactive_batch_return(output) - assert output.get('history_tokens') == history - history += output.get('input_tokens') + output.get('tokens') - - def test_multiple_rounds_streaming(self): - api_client = APIClient(BASE_URL) - history = 0 - session_id = random.randint(0, 100000) - for i in range(3): - outputList = [] - for output in api_client.chat_interactive_v1(prompt='Hi, pls intro yourself', - stream=True, - temperature=0.01, - interactive_mode=True, - session_id=session_id): - outputList.append(output) - print(outputList) - assert_chat_interactive_stream_return(outputList[-1], True, index=len(outputList) - 2) - for index in range(0, len(outputList) - 1): - assert_chat_interactive_stream_return(outputList[index], index=index) - assert outputList[-1].get('history_tokens') == history - history += outputList[-1].get('input_tokens') + outputList[-1].get('tokens') - - def test_minimum_topp(self): - api_client = APIClient(BASE_URL) - outputList = [] - for i in range(3): - for output in api_client.chat_interactive_v1(prompt='Shanghai is', top_p=0.01, request_output_len=10): - continue - assert_chat_interactive_batch_return(output) - outputList.append(output) - assert outputList[0] == outputList[1] - assert outputList[1] == outputList[2] - - def test_minimum_topp_streaming(self): - api_client = APIClient(BASE_URL) - model_name = api_client.available_models[0] - responseList = [] - for i in range(3): - outputList = [] - response = '' - for output in api_client.chat_interactive_v1(model=model_name, - prompt='Hi, pls intro yourself', - stream=True, - top_p=0.01, - request_output_len=10): - outputList.append(output) - assert_chat_interactive_stream_return(outputList[-1], True, index=len(outputList) - 2) - for index in range(0, len(outputList) - 1): - assert_chat_interactive_stream_return(outputList[index], index=index) - response += outputList[index].get('text') - responseList.append(response) - assert responseList[0] == responseList[1] or responseList[1] == responseList[2] - - def test_minimum_topk(self): - api_client = APIClient(BASE_URL) - outputList = [] - for i in range(3): - for output in api_client.chat_interactive_v1(prompt='Shanghai is', top_k=1, request_output_len=10): - continue - assert_chat_interactive_batch_return(output) - outputList.append(output) - assert outputList[0] == outputList[1] - assert outputList[1] == outputList[2] - - def test_minimum_topk_streaming(self): - api_client = APIClient(BASE_URL) - model_name = api_client.available_models[0] - responseList = [] - for i in range(3): - 
outputList = [] - response = '' - for output in api_client.chat_interactive_v1(model=model_name, - prompt='Hi, pls intro yourself', - stream=True, - top_k=1, - request_output_len=10): - outputList.append(output) - assert_chat_interactive_stream_return(outputList[-1], True, index=len(outputList) - 2) - for index in range(0, len(outputList) - 1): - assert_chat_interactive_stream_return(outputList[index], index=index) - response += outputList[index].get('text') - responseList.append(response) - assert responseList[0] == responseList[1] - assert responseList[1] == responseList[2] - - def test_mutilple_times_response_should_not_same(self): - api_client = APIClient(BASE_URL) - outputList = [] - for i in range(3): - for output in api_client.chat_interactive_v1(prompt='Shanghai is', request_output_len=100): - continue - assert_chat_interactive_batch_return(output) - outputList.append(output) - assert outputList[0] != outputList[1] or outputList[1] != outputList[2] - - def test_mutilple_times_response_should_not_same_streaming(self): - api_client = APIClient(BASE_URL) - model_name = api_client.available_models[0] - responseList = [] - for i in range(3): - outputList = [] - response = '' - for output in api_client.chat_interactive_v1(model=model_name, - prompt='Hi, pls intro yourself', - stream=True, - request_output_len=100): - outputList.append(output) - assert_chat_interactive_stream_return(outputList[-1], True) - for index in range(0, len(outputList) - 1): - assert_chat_interactive_stream_return(outputList[index], index=index) - response += outputList[index].get('text') - responseList.append(response) - assert responseList[0] != responseList[1] or responseList[1] != responseList[2] - - def test_longtext_input(self): - api_client = APIClient(BASE_URL) - for output in api_client.chat_interactive_v1(prompt='Hi, pls intro yourself' * 100000, temperature=0.01): - continue - assert output.get('finish_reason') == 'length' - assert output.get('text') == '' - - def test_longtext_input_streaming(self): - api_client = APIClient(BASE_URL) - outputList = [] - for output in api_client.chat_interactive_v1(prompt='Hi, pls intro yourself' * 100000, - stream=True, - temperature=0.01): - outputList.append(output) - assert outputList[0].get('finish_reason') == 'length', outputList - assert outputList[0].get('text') == '' - assert len(outputList) == 1 - - def test_ignore_eos(self): - api_client = APIClient(BASE_URL) - for output in api_client.chat_interactive_v1(prompt='Hi, what is your name?', - ignore_eos=True, - request_output_len=100, - temperature=0.01): - continue - assert_chat_interactive_batch_return(output) - assert output.get('tokens') == 100 or output.get('tokens') == 101 - assert output.get('finish_reason') == 'length' - - def test_ignore_eos_streaming(self): - api_client = APIClient(BASE_URL) - outputList = [] - for output in api_client.chat_interactive_v1(prompt='Hi, what is your name?', - ignore_eos=True, - stream=True, - request_output_len=100, - temperature=0.01): - outputList.append(output) - assert_chat_interactive_stream_return(outputList[-1], True, index=len(outputList) - 2) - for index in range(0, len(outputList) - 1): - assert_chat_interactive_stream_return(outputList[index], index=index) - assert output.get('finish_reason') == 'length' - assert outputList[-1].get('tokens') == 100 or outputList[-1].get('tokens') == 101 - - def test_max_tokens(self): - api_client = APIClient(BASE_URL) - for output in api_client.chat_interactive_v1(prompt='Hi, pls intro yourself', - request_output_len=5, - 
temperature=0.01): - continue - assert_chat_interactive_batch_return(output) - assert output.get('finish_reason') == 'length' - assert output.get('tokens') == 5 or output.get('tokens') == 6 - - def test_max_tokens_streaming(self): - api_client = APIClient(BASE_URL) - outputList = [] - for output in api_client.chat_interactive_v1(prompt='Hi, pls intro yourself', - stream=True, - request_output_len=5, - temperature=0.01): - outputList.append(output) - assert_chat_interactive_stream_return(outputList[-1], True, index=len(outputList) - 2) - for index in range(0, len(outputList) - 1): - assert_chat_interactive_stream_return(outputList[index], index=index) - assert output.get('finish_reason') == 'length' - assert outputList[-1].get('tokens') == 5 or outputList[-1].get('tokens') == 6 - - def test_input_validation(self): - api_client = APIClient(BASE_URL) - for output in api_client.chat_interactive_v1(prompt='Hi', top_p=0): - continue - assert output.get('code') == 400 - assert output.get('message') == 'The top_p `0.0` must be in (0, 1].' - assert output.get('object') == 'error' - - for output in api_client.chat_interactive_v1(prompt='Hi', top_p=1.01): - continue - assert output.get('code') == 400 - assert output.get('message') == 'The top_p `1.01` must be in (0, 1].' - assert output.get('object') == 'error' - - for output in api_client.chat_interactive_v1(prompt='Hi', top_p='test'): - continue - assert output.get('code') is None - assert 'Input should be a valid number' in str(output) - - for output in api_client.chat_interactive_v1(prompt='Hi', temperature=-0.01): - continue - assert output.get('code') == 400 - assert output.get('message') == 'The temperature `-0.01` must be in [0, 2]' - assert output.get('object') == 'error' - - for output in api_client.chat_interactive_v1(prompt='Hi', temperature=2.01): - continue - assert output.get('code') == 400 - assert output.get('message') == 'The temperature `2.01` must be in [0, 2]' - assert output.get('object') == 'error' - - for output in api_client.chat_interactive_v1(prompt='Hi', temperature='test'): - continue - assert output.get('code') is None - assert 'Input should be a valid number' in str(output) - - for output in api_client.chat_interactive_v1(prompt='Hi', top_k=-1): - continue - assert output.get('code') == 400 - assert output.get('message') == 'The top_k `-1` cannot be a negative integer.' - assert output.get('object') == 'error' - - for output in api_client.chat_interactive_v1(prompt='Hi', top_k='test'): - continue - assert output.get('code') is None - assert 'Input should be a valid integer' in str(output) - - def test_input_validation_streaming(self): - api_client = APIClient(BASE_URL) - for output in api_client.chat_interactive_v1(prompt='Hi', stream=True, top_p=0): - continue - assert output.get('code') == 400 - assert output.get('message') == 'The top_p `0.0` must be in (0, 1].' - assert output.get('object') == 'error' - - for output in api_client.chat_interactive_v1(prompt='Hi', stream=True, top_p=1.01): - continue - assert output.get('code') == 400 - assert output.get('message') == 'The top_p `1.01` must be in (0, 1].' 
- assert output.get('object') == 'error' - - for output in api_client.chat_interactive_v1(prompt='Hi', stream=True, top_p='test'): - continue - assert output.get('code') is None - assert 'Input should be a valid number' in str(output) - - for output in api_client.chat_interactive_v1(prompt='Hi', stream=True, temperature=-0.01): - continue - assert output.get('code') == 400 - assert output.get('message') == 'The temperature `-0.01` must be in [0, 2]' - assert output.get('object') == 'error' - - for output in api_client.chat_interactive_v1(prompt='Hi', stream=True, temperature=2.01): - continue - assert output.get('code') == 400 - assert output.get('message') == 'The temperature `2.01` must be in [0, 2]' - assert output.get('object') == 'error' - - for output in api_client.chat_interactive_v1(prompt='Hi', stream=True, temperature='test'): - continue - assert output.get('code') is None - assert 'Input should be a valid number' in str(output) - - for output in api_client.chat_interactive_v1(prompt='Hi', stream=True, top_k=-1): - continue - assert output.get('code') == 400 - assert output.get('message') == 'The top_k `-1` cannot be a negative integer.' - assert output.get('object') == 'error' - - for output in api_client.chat_interactive_v1(prompt='Hi', stream=True, top_k='test'): - continue - assert output.get('code') is None - assert 'Input should be a valid integer' in str(output) diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 82955c09aa..e00e68425b 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -1,7 +1,5 @@ import json import os -import random -import string import subprocess from time import sleep, time @@ -145,13 +143,6 @@ def run_all_step(config, cases_info, worker_id: str = '', port: int = DEFAULT_PO with assume: assert restful_result, msg - with allure.step(case + ' step3 - restful_test - interactive chat'): - active_result, interactive_log, msg = interactive_test(config, case, case_info, model, http_url, worker_id) - allure.attach.file(interactive_log, attachment_type=allure.attachment_type.TEXT) - - with assume: - assert active_result, msg - def open_chat_test(config, case, case_info, model, url, worker_id: str = ''): log_path = config.get('log_path') @@ -190,47 +181,6 @@ def open_chat_test(config, case, case_info, model, url, worker_id: str = ''): return result, restful_log, msg -def interactive_test(config, case, case_info, model, url, worker_id: str = ''): - log_path = config.get('log_path') - - interactive_log = os.path.join(log_path, 'interactive_' + model + worker_id + '_' + case + '.log') - - file = open(interactive_log, 'w') - - result = True - - api_client = APIClient(url) - file.writelines('available_models:' + ','.join(api_client.available_models) + '\n') - - # Randomly generate 6 characters and concatenate them into a string. 
- characters = string.digits - random_chars = ''.join(random.choice(characters) for i in range(6)) - - messages = [] - msg = '' - for prompt_detail in case_info: - prompt = list(prompt_detail.keys())[0] - new_prompt = {'role': 'user', 'content': prompt} - messages.append(new_prompt) - file.writelines('prompt:' + prompt + '\n') - - for output in api_client.chat_interactive_v1(prompt=prompt, - interactive_mode=True, - session_id=random_chars, - top_k=1, - request_output_len=256): - output_content = output.get('text') - file.writelines('output:' + output_content + '\n') - - case_result, reason = assert_result(output_content, prompt_detail.values(), model) - file.writelines('result:' + str(case_result) + ',reason:' + reason + '\n') - if not case_result: - msg += reason - result = result & case_result - file.close() - return result, interactive_log, msg - - def health_check(url): try: api_client = APIClient(url) diff --git a/docs/en/llm/api_server.md b/docs/en/llm/api_server.md index 274ec2ff25..42cdc1f275 100644 --- a/docs/en/llm/api_server.md +++ b/docs/en/llm/api_server.md @@ -151,28 +151,6 @@ for item in api_client.completions_v1(model=model_name, prompt='hi'): print(item) ``` -As for `/v1/chat/interactive`,we disable the feature by default. Please open it by setting `interactive_mode = True`. If you don't, it falls back to openai compatible interfaces. - -Keep in mind that `session_id` indicates an identical sequence and all requests belonging to the same sequence must share the same `session_id`. -For instance, in a sequence with 10 rounds of chatting requests, the `session_id` in each request should be the same. - -```python -from lmdeploy.serve.openai.api_client import APIClient -api_client = APIClient(f'http://{server_ip}:{server_port}') -messages = [ - "hi, what's your name?", - "who developed you?", - "Tell me more about your developers", - "Summarize the information we've talked so far" -] -for message in messages: - for item in api_client.chat_interactive_v1(prompt=message, - session_id=1, - interactive_mode=True, - stream=False): - print(item) -``` - ### Tools May refer to [api_server_tools](./api_server_tools.md). 
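With `/v1/chat/interactive` removed, multi-turn state moves to the client while the server reuses the shared prefix through prefix caching. A minimal migration sketch for the example deleted above, assuming the existing `APIClient.chat_completions_v1` helper and an OpenAI-style response layout; the server address is a placeholder:

```python
# Sketch: client-side history replaces the removed interactive session.
from lmdeploy.serve.openai.api_client import APIClient

server_ip, server_port = '0.0.0.0', 23333  # placeholder address
api_client = APIClient(f'http://{server_ip}:{server_port}')
model_name = api_client.available_models[0]

messages = []
for user_turn in [
        "hi, what's your name?",
        'who developed you?',
        "Summarize the information we've talked so far",
]:
    messages.append({'role': 'user', 'content': user_turn})
    for item in api_client.chat_completions_v1(model=model_name, messages=messages, stream=False):
        reply = item['choices'][0]['message']['content']
    # keep the assistant reply so the next round carries the full history
    messages.append({'role': 'assistant', 'content': reply})
    print(reply)
```

Each round resends the accumulated `messages`; with prefix caching enabled, the previously seen prefix can be matched against cached blocks, so the cost should stay close to that of the old interactive mode.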
diff --git a/docs/zh_cn/llm/api_server.md b/docs/zh_cn/llm/api_server.md index 8bb91c619e..4d4c999584 100644 --- a/docs/zh_cn/llm/api_server.md +++ b/docs/zh_cn/llm/api_server.md @@ -169,28 +169,6 @@ for item in api_client.completions_v1(model=model_name, prompt='hi'): print(item) ``` -关于 `/v1/chat/interactive` 接口,我们默认是关闭的。在使用时,请设置`interactive_mode = True`打开它。否则,它会退化为 openai 接口。 - -在交互式推理中,每个对话序列的 id 必须唯一,所有属于该独立的对话请求,必须使用相同的 id。这里的 id 对应与接口中的 `session_id`。 -比如,一个对话序列中,有 10 轮对话请求,那么每轮对话请求中的 `session_id` 都要相同。 - -```python -from lmdeploy.serve.openai.api_client import APIClient -api_client = APIClient(f'http://{server_ip}:{server_port}') -messages = [ - "hi, what's your name?", - "who developed you?", - "Tell me more about your developers", - "Summarize the information we've talked so far" -] -for message in messages: - for item in api_client.chat_interactive_v1(prompt=message, - session_id=1, - interactive_mode=True, - stream=False): - print(item) -``` - ### 工具调用 参考 [api_server_tools](./api_server_tools.md)。 diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index 486400d387..bcf8ccbffb 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -26,8 +26,8 @@ CompletionResponse, CompletionResponseChoice, CompletionResponseStreamChoice, CompletionStreamResponse, DeltaMessage, EmbeddingsRequest, EncodeRequest, EncodeResponse, ErrorResponse, - GenerateRequest, GenerateResponse, LogProbs, ModelCard, ModelList, - ModelPermission, TopLogprob, UsageInfo) + GenerateRequest, LogProbs, ModelCard, ModelList, ModelPermission, + TopLogprob, UsageInfo) from lmdeploy.serve.openai.reasoning_parser.reasoning_parser import ReasoningParser, ReasoningParserManager from lmdeploy.serve.openai.tool_parser.tool_parser import ToolParser, ToolParserManager from lmdeploy.tokenizer import DetokenizeState, Tokenizer @@ -116,17 +116,17 @@ def create_error_response(status: HTTPStatus, message: str, error_type='invalid_ async def check_request(request) -> Optional[JSONResponse]: """Check if a request is valid.""" if hasattr(request, 'model') and request.model not in get_model_list(): - return create_error_response(HTTPStatus.NOT_FOUND, f'The model `{request.model}` does not exist.') + return create_error_response(HTTPStatus.NOT_FOUND, f'The model "{request.model}" does not exist.') if hasattr(request, 'n') and request.n <= 0: - return create_error_response(HTTPStatus.BAD_REQUEST, f'The n `{request.n}` must be a positive int.') + return create_error_response(HTTPStatus.BAD_REQUEST, f'The n "{request.n}" must be a positive int.') if hasattr(request, 'top_p') and not (request.top_p > 0 and request.top_p <= 1): - return create_error_response(HTTPStatus.BAD_REQUEST, f'The top_p `{request.top_p}` must be in (0, 1].') + return create_error_response(HTTPStatus.BAD_REQUEST, f'The top_p "{request.top_p}" must be in (0, 1].') if hasattr(request, 'top_k') and request.top_k < 0: return create_error_response(HTTPStatus.BAD_REQUEST, - f'The top_k `{request.top_k}` cannot be a negative integer.') + f'The top_k "{request.top_k}" cannot be a negative integer.') if hasattr(request, 'temperature') and not (request.temperature <= 2 and request.temperature >= 0): return create_error_response(HTTPStatus.BAD_REQUEST, - f'The temperature `{request.temperature}` must be in [0, 2]') + f'The temperature "{request.temperature}" must be in [0, 2]') return @@ -330,7 +330,7 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque if error_check_ret is not 
None: return error_check_ret if VariableInterface.async_engine.id2step.get(request.session_id, 0) != 0: - return create_error_response(HTTPStatus.BAD_REQUEST, f'The session_id `{request.session_id}` is occupied.') + return create_error_response(HTTPStatus.BAD_REQUEST, f'The session_id {request.session_id} is occupied.') model_name = request.model adapter_name = None @@ -604,7 +604,7 @@ async def completions_v1(request: CompletionRequest, raw_request: Request = None if error_check_ret is not None: return error_check_ret if VariableInterface.async_engine.id2step.get(request.session_id, 0) != 0: - return create_error_response(HTTPStatus.BAD_REQUEST, f'The session_id `{request.session_id}` is occupied.') + return create_error_response(HTTPStatus.BAD_REQUEST, f'The session_id {request.session_id} is occupied.') model_name = request.model adapter_name = None @@ -792,142 +792,8 @@ def encode(prompt: str, do_preprocess: bool, add_bos: bool): @router.post('/v1/chat/interactive', dependencies=[Depends(check_api_key)]) async def chat_interactive_v1(request: GenerateRequest, raw_request: Request = None): - """Generate completion for the request. - - - On interactive mode, the chat history is kept on the server. Please set - `interactive_mode = True`. - - On normal mode, no chat history is kept on the server. Set - `interactive_mode = False`. - - The request should be a JSON object with the following fields: - - prompt: the prompt to use for the generation. - - image_url(str | List[str] | None): the image url or base64 encoded string - for VL models. - - session_id: determine which instance will be called. If not specified - with a value other than -1, using random value directly. - - interactive_mode (bool): turn on interactive mode or not. On interactive - mode, session history is kept on the server (and vice versa). - - stream: whether to stream the results or not. - - stop (str | List[str] | None): To stop generating further - tokens. Only accept stop words that's encoded to one token idex. - - request_output_len (int): output token nums. If not specified, will use - maximum possible number for a session. - - top_p (float): If set to float < 1, only the smallest set of most - probable tokens with probabilities that add up to top_p or higher - are kept for generation. - - top_k (int): The number of the highest probability vocabulary - tokens to keep for top-k-filtering - - temperature (float): to modulate the next token probability - - repetition_penalty (float): The parameter for repetition penalty. - 1.0 means no penalty - - ignore_eos (bool): indicator for ignoring eos - - skip_special_tokens (bool): Whether or not to remove special tokens - in the decoding. Default to be True. - - spaces_between_special_tokens (bool): Whether or not to add spaces - around special tokens. The behavior of Fast tokenizers is to have - this to False. This is setup to True in slow tokenizers. - - adapter_name (str): For slora inference. Choose which lora to do the - inference. - - min_new_tokens (int): To generate at least numbers of tokens. - - min_p (float): Minimum token probability, which will be scaled by the - probability of the most likely token. It must be a value between - 0 and 1. 
Typical values are in the 0.01-0.2 range, comparably - selective as setting `top_p` in the 0.99-0.8 range (use the - opposite of normal `top_p` values) - """ - if request.cancel: - if request.session_id != -1: - await VariableInterface.async_engine.stop_session(request.session_id) - return {'text': '', 'tokens': 0, 'input_tokens': 0, 'history_tokens': 0, 'finish_reason': 'stop'} - else: - return create_error_response(HTTPStatus.BAD_REQUEST, 'please set a session_id to cancel a request') - error_check_ret = await check_request(request) - if error_check_ret is not None: - return error_check_ret - if request.session_id == -1: - VariableInterface.session_id += 1 - request.session_id = VariableInterface.session_id - - async_engine = VariableInterface.async_engine - sequence_start = async_engine.id2step.get(request.session_id, 0) == 0 - sequence_end = not request.interactive_mode - if isinstance(request.stop, str): - request.stop = [request.stop] - - end_session = sequence_end and request.prompt == '' and request.request_output_len == 0 - if end_session: - await async_engine.end_session(request.session_id) - return JSONResponse(dict(text='', tokens=0, input_tokens=0, history_tokens=0, finish_reason='stop')) - - random_seed = request.seed if request.seed else None - - gen_config = GenerationConfig(max_new_tokens=request.request_output_len, - do_sample=True, - top_p=request.top_p, - top_k=request.top_k, - temperature=request.temperature, - repetition_penalty=request.repetition_penalty, - ignore_eos=request.ignore_eos, - stop_words=request.stop, - skip_special_tokens=request.skip_special_tokens, - spaces_between_special_tokens=request.spaces_between_special_tokens, - min_new_tokens=request.min_new_tokens, - min_p=request.min_p, - random_seed=random_seed) - if request.image_url: - from lmdeploy.vl import load_image - if isinstance(request.image_url, List): - request.prompt = (request.prompt, [load_image(url) for url in request.image_url]) - else: - request.prompt = (request.prompt, load_image(request.image_url)) - if not hasattr(async_engine, '_convert_prompts'): - return create_error_response(HTTPStatus.BAD_REQUEST, '`image_url` argument only works for VL model') - request.prompt = async_engine._convert_prompts(request.prompt) - generation = async_engine.generate( - request.prompt, - request.session_id, - gen_config=gen_config, - stream_response=True, # always use stream to enable batching - sequence_start=sequence_start, - sequence_end=sequence_end, - adapter_name=request.adapter_name) - - # Streaming case - async def stream_results() -> AsyncGenerator[bytes, None]: - async for out in generation: - chunk = GenerateResponse(text=out.response, - tokens=out.generate_token_len, - input_tokens=out.input_token_len, - history_tokens=out.history_token_len, - finish_reason=out.finish_reason) - data = chunk.model_dump_json() - yield f'{data}\n' - - if request.stream: - return StreamingResponse(stream_results(), media_type='text/event-stream') - else: - ret = {} - text = '' - tokens, input_tokens, history_tokens = 0, 0, 0 - finish_reason = None - async for out in generation: - if await raw_request.is_disconnected(): - # Abort the request if the client disconnects. 
- await async_engine.stop_session(request.session_id) - return create_error_response(HTTPStatus.BAD_REQUEST, 'Client disconnected') - text += out.response - tokens = out.generate_token_len - input_tokens = out.input_token_len - history_tokens = out.history_token_len - finish_reason = out.finish_reason - ret = { - 'text': text, - 'tokens': tokens, - 'input_tokens': input_tokens, - 'history_tokens': history_tokens, - 'finish_reason': finish_reason - } - return JSONResponse(ret) + return create_error_response(HTTPStatus.BAD_REQUEST, + 'v1/chat/interactive is removed, pleease use v1/chat/completions instead') def handle_torchrun(): @@ -1120,10 +986,6 @@ def serve(model_path: str, if proxy_url is not None: VariableInterface.proxy_url = proxy_url VariableInterface.api_server_url = f'{http_or_https}://{server_name}:{server_port}' # noqa - for i in range(3): - print(f'HINT: Please open \033[93m\033[1m{http_or_https}://' - f'{server_name}:{server_port}\033[0m in a browser for detailed api' - ' usage!!!') uvicorn.run(app=app, host=server_name, port=server_port, From 617d3172ac2ec6ea48870033bc241dc7b207fc14 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Thu, 3 Apr 2025 11:23:44 +0800 Subject: [PATCH 18/34] fix build error on windows platform --- builder/windows/generate.ps1 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/builder/windows/generate.ps1 b/builder/windows/generate.ps1 index 96dbbc70bd..2488288155 100644 --- a/builder/windows/generate.ps1 +++ b/builder/windows/generate.ps1 @@ -5,4 +5,5 @@ cmake .. -A x64 -T "v142,cuda=$env:CUDA_PATH" ` -DBUILD_MULTI_GPU=OFF ` -DCMAKE_CUDA_FLAGS="-lineinfo" ` -DUSE_NVTX=ON ` - -DBUILD_TEST="$env:BUILD_TEST" + -DBUILD_TEST="$env:BUILD_TEST" ` + -DCMAKE_POLICY_VERSION_MINIMUM=3.5 From 50e56e2389646fce2d7f584e6e09c4318570f287 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Thu, 3 Apr 2025 12:31:10 +0800 Subject: [PATCH 19/34] fix chat --- lmdeploy/turbomind/chat.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py index aa2e38402f..fe371d05ca 100644 --- a/lmdeploy/turbomind/chat.py +++ b/lmdeploy/turbomind/chat.py @@ -155,7 +155,6 @@ def main(model_path: str, if user_input == 'exit': exit(0) elif user_input == 'end': - loop.run_until_complete(generator.async_end(session_id)) seed = random.getrandbits(64) messages = [] else: From 38ea2aee3c321615563318dc9c342b916e7c5c65 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Thu, 3 Apr 2025 18:16:12 +0800 Subject: [PATCH 20/34] update generate.ps1 --- builder/windows/generate.ps1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/builder/windows/generate.ps1 b/builder/windows/generate.ps1 index 2488288155..f2a244b610 100644 --- a/builder/windows/generate.ps1 +++ b/builder/windows/generate.ps1 @@ -6,4 +6,4 @@ cmake .. 
-A x64 -T "v142,cuda=$env:CUDA_PATH" ` -DCMAKE_CUDA_FLAGS="-lineinfo" ` -DUSE_NVTX=ON ` -DBUILD_TEST="$env:BUILD_TEST" ` - -DCMAKE_POLICY_VERSION_MINIMUM=3.5 + -DCMAKE_POLICY_VERSION_MINIMUM="3.5" From e1489a5a8eedc9341c59ca2d6a8585925d1d2af2 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Thu, 3 Apr 2025 18:31:10 +0800 Subject: [PATCH 21/34] fix clang-format error --- lmdeploy/turbomind/turbomind.py | 12 +- src/turbomind/engine/gateway.h | 8 +- src/turbomind/engine/model_request.cc | 11 +- src/turbomind/engine/request_queue.h | 15 +-- src/turbomind/models/llama/BlockTrie.cc | 1 - src/turbomind/models/llama/LlamaBatch.cc | 109 ++++++------------ src/turbomind/models/llama/SequenceManager.cc | 31 ++--- src/turbomind/python/bind.cpp | 4 +- 8 files changed, 68 insertions(+), 123 deletions(-) diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 73920ae00e..68a9a9f537 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -223,8 +223,7 @@ def _postprocess_config(self, tm_config: TurbomindModelConfig, engine_config: Tu # pack `self.config` and `self.engine_config` into a dict self.config_dict = self.config.to_dict() self.config_dict.update(dict(engine_config=asdict(self.engine_config))) - logger.info(f'turbomind model config:\n\n' - f'{json.dumps(self.config_dict, indent=2)}') + logger.info(f'turbomind model config:\n\n{json.dumps(self.config_dict, indent=2)}') def _from_hf(self, model_source: ModelSource, model_path: str, engine_config: TurbomindEngineConfig): """Load model which is in hf format.""" @@ -526,15 +525,8 @@ def prepare_inputs(self, async def async_cancel(self, session_id: int = None): self.model_inst.cancel() - def async_end_cb(self, fut: asyncio.Future, status: int): - """executing on engine's signaling thread.""" - logger.info(f'[async_end_cb] session ended, status = {status}') - fut.get_loop().call_soon_threadsafe(fut.set_result, status) - async def async_end(self, session_id): - fut = asyncio.get_running_loop().create_future() - self.model_inst.end(partial(self.async_end_cb, fut), session_id) - await fut + pass def async_signal_cb(self, s: StreamingSemaphore): """executing on engine's signaling thread.""" diff --git a/src/turbomind/engine/gateway.h b/src/turbomind/engine/gateway.h index dee7783c5e..0b7d05c25e 100644 --- a/src/turbomind/engine/gateway.h +++ b/src/turbomind/engine/gateway.h @@ -68,13 +68,7 @@ class Gateway { { int rank = -1; - // if (!r->session.start_flag) { - // // route to corresponding rank - // rank = seqid2rank_.find(r->session.id); - // } - // else { - rank = next_.fetch_add(1, std::memory_order_relaxed) % size_; - // } + rank = next_.fetch_add(1, std::memory_order_relaxed) % size_; if (rank >= 0) { queues_[rank]->push({std::move(r)}); diff --git a/src/turbomind/engine/model_request.cc b/src/turbomind/engine/model_request.cc index c1ded60c3b..b35568c770 100644 --- a/src/turbomind/engine/model_request.cc +++ b/src/turbomind/engine/model_request.cc @@ -123,13 +123,16 @@ auto ModelRequest::Forward(InputParam param, std::function cb) -> Output add(outputs_, "sequence_length", TYPE_INT32, MEMORY_CPU, 1); if (param.gen_cfg.output_logits) { - const int len = param.gen_cfg.output_logits == GenerationConfig::kAll ? max_in_out_len - param.session.step : max_out_len; + const int len = + param.gen_cfg.output_logits == GenerationConfig::kAll ? 
max_in_out_len - param.session.step : max_out_len; add(outputs_, "logits", TYPE_FP32, MEMORY_CPU, len, vocab_size_); TM_LOG_INFO("[ModelRequest][forward] ID %llu, output_logits len %d", param.session.id, len); } if (param.gen_cfg.output_last_hidden_state) { - const int len = param.gen_cfg.output_last_hidden_state == GenerationConfig::kAll ? max_in_out_len - param.session.step : max_out_len; + const int len = param.gen_cfg.output_last_hidden_state == GenerationConfig::kAll ? + max_in_out_len - param.session.step : + max_out_len; add(outputs_, "last_hidden_state", data_type_, MEMORY_CPU, len, hidden_dim_); TM_LOG_INFO("[ModelRequest][forward] ID %llu, output_last_hidden_state len %d", param.session.id, len); } @@ -151,9 +154,7 @@ auto ModelRequest::Forward(InputParam param, std::function cb) -> Output auto state = std::make_shared(); - // if (param.session.start_flag) { - session_id_ = param.session.id; - // } + session_id_ = param.session.id; r->id = param.session.id; r->session = param.session; diff --git a/src/turbomind/engine/request_queue.h b/src/turbomind/engine/request_queue.h index a0740ab41e..3d60dbe664 100644 --- a/src/turbomind/engine/request_queue.h +++ b/src/turbomind/engine/request_queue.h @@ -47,16 +47,11 @@ class RequestQueue { auto it = queue_.begin(); int count{}; while (rs.size() < max_rs_size && count < max_count && it != queue_.end()) { - // if (!(*it)->session.start_flag) { - rs.push_back(std::move(*it)); - ++count; - auto tmp = it; - ++it; - queue_.erase(tmp); - // } - // else { - // ++it; - // } + rs.push_back(std::move(*it)); + ++count; + auto tmp = it; + ++it; + queue_.erase(tmp); } return count; diff --git a/src/turbomind/models/llama/BlockTrie.cc b/src/turbomind/models/llama/BlockTrie.cc index c620cbdd1e..97462ae7f0 100644 --- a/src/turbomind/models/llama/BlockTrie.cc +++ b/src/turbomind/models/llama/BlockTrie.cc @@ -54,7 +54,6 @@ std::tuple>> BlockTri std::tuple>> BlockTrie::Cache(const Sequence& seq, const std::vector& tokens) { - TM_LOG_INFO("[BlockTrie][cache] session %llu, seq.blocks %d, tokens %d", seq.id, seq.blocks.size(), tokens.size()); FT_CHECK(seq.status != Sequence::kCached); FT_CHECK(tokens.size() <= seq.blocks.size() * block_seq_len_); diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 085cd60c4b..4d06ee8b28 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -224,25 +224,6 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vectorsession.step; - // if (s < 0) { - // s = ptr->tokens.size(); - // } - // else if (s > ptr->tokens.size()) { - // if (tp_rank_ == 0) { - // TM_LOG_WARNING("[ProcessInferRequests] Skipping invalid step (%d) setting for ID %lu", s, ptr->id); - // } - // s = ptr->tokens.size(); - // } - // return s; - // }(); - - // if (step + input_length > session_len_) { - // signals.push_back([r] { UpdateState(*r, Request::kTooLong, 0); }); - // continue; - // } - FT_CHECK(!state.requests[idx]); state.requests[idx] = r; @@ -358,36 +339,26 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vectorsession.start_flag) { - seq.rope_theta = model_->attn_param_.rope.base; - if (model_->attn_param_.rope.type == RopeType::kDynamic) { - auto scaling_factor = model_->attn_param_.rope.factor; - if (scaling_factor >= 1.f) { // infer by current context length - auto max_seq_len = state.h_context_length[idx]; - auto max_pos_emb = model_->attn_param_.rope.max_position_embeddings; - if (max_seq_len > max_pos_emb) { - 
scaling_factor = scaling_factor * max_seq_len / max_pos_emb - (scaling_factor - 1); - float rope_dim = model_->attn_param_.rope.dim; - seq.rope_theta *= powf(scaling_factor, rope_dim / (rope_dim - 2.f)); - TM_LOG_INFO("[ProcessInferRequests] %ld rope_scaling_factor: %f, rope_theta = %f", - (long)seq.id, - scaling_factor, - seq.rope_theta); - } + seq.rope_theta = model_->attn_param_.rope.base; + if (model_->attn_param_.rope.type == RopeType::kDynamic) { + auto scaling_factor = model_->attn_param_.rope.factor; + if (scaling_factor >= 1.f) { // infer by current context length + auto max_seq_len = state.h_context_length[idx]; + auto max_pos_emb = model_->attn_param_.rope.max_position_embeddings; + if (max_seq_len > max_pos_emb) { + scaling_factor = scaling_factor * max_seq_len / max_pos_emb - (scaling_factor - 1); + float rope_dim = model_->attn_param_.rope.dim; + seq.rope_theta *= powf(scaling_factor, rope_dim / (rope_dim - 2.f)); + TM_LOG_INFO("[ProcessInferRequests] %ld rope_scaling_factor: %f, rope_theta = %f", + (long)seq.id, + scaling_factor, + seq.rope_theta); } } - // } + } state.h_rope_theta[idx] = seq.rope_theta; - // if (r->session.start_flag) { - // prepare to initialize random state for new sequence - h_random_seed_[idx] = r->gen_cfg.random_seed; - // } - // else { - // // Recover device states if not a new sequence - // h_curand_state_[existing_idx.size()] = *(curandState_t*)seq.random_state.data(); - // existing_idx.push_back(idx); - // } + h_random_seed_[idx] = r->gen_cfg.random_seed; // increment pointer idx++; @@ -1304,7 +1275,7 @@ void LlamaBatch::OutputLogits(const float* logits, int first, int last, Gener const int valid_len = input_len - std::max(0, diff); - TM_LOG_INFO("[output_logits] %d %d %d %d %d %d %d", + TM_LOG_DEBUG("[output_logits] %d %d %d %d %d %d %d", history_len, offset, cache_len, @@ -1511,9 +1482,7 @@ void LlamaBatch::Finish(GenerationState& g, std::vector& signals) for (int i = 0; i < batch_size - g.partial; ++i) { if (state_->h_finished[i]) { ++g.finished_count; - // if (!state_->requests[i]->session.end_flag) { - need_sync = true; - // } + need_sync = true; } } if (need_sync) { @@ -1558,10 +1527,7 @@ template auto LlamaBatch::Interrupt(int index, bool force_stop) -> Signal { if (tp_rank_ == 0) { - TM_LOG_INFO("[Interrupt] slot %d, request %lu, stop %d", - index, - (long)state_->requests[index]->id, - force_stop); + TM_LOG_INFO("[Interrupt] slot %d, request %llu, stop %d", index, state_->requests[index]->id, force_stop); } if (debug_ && tp_rank_ == 0) { @@ -1575,31 +1541,26 @@ auto LlamaBatch::Interrupt(int index, bool force_stop) -> Signal TM_LOG_INFO("[Interrupt] slot %d, tokens [%s]", index, ss.str().c_str()); } - // if (state_->requests[index]->session.end_flag || force_end) { - // // Sequence is ending this round or a stop request is issued to end it - // FT_CHECK(sequence_manager_->Erase(state_->requests[index]->id)); - // } - // else { - const int output_len = state_->h_context_length[index]; - auto& seq = *state_->sequences[index]; - // Update token IDs - seq.tokens.resize(output_len); + const int output_len = state_->h_context_length[index]; + auto& seq = *state_->sequences[index]; + + // Update token IDs + seq.tokens.resize(output_len); - // output_ids is updated & synced in `Finish` - const auto output_ids = state_->requests[index]->output_ids.getPtr(); - std::copy_n(output_ids, output_len, seq.tokens.data()); - // Cache the generated tokens of the sequence - sequence_manager_->CacheGeneration(seq); + // output_ids is updated & synced in 
`Finish` + const auto output_ids = state_->requests[index]->output_ids.getPtr(); + std::copy_n(output_ids, output_len, seq.tokens.data()); + // Cache the generated tokens of the sequence + sequence_manager_->CacheGeneration(seq); - // Save random state in host memory - seq.random_state.resize(sizeof(curandState_t)); - // This async copy must be synchronized by the caller - Copy(state_->curand_state + index, 1, (curandState_t*)seq.random_state.data()); + // Save random state in host memory + seq.random_state.resize(sizeof(curandState_t)); + // This async copy must be synchronized by the caller + Copy(state_->curand_state + index, 1, (curandState_t*)seq.random_state.data()); - // Set unlock flag for corresponding blocks, will be unlocked in the next `Materialize()` - sequence_manager_->UpdateAndSetUnlock(seq); - // } + // Set unlock flag for corresponding blocks, will be unlocked in the next `Materialize()` + sequence_manager_->UpdateAndSetUnlock(seq); state_->sequences[index] = nullptr; diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc index 215a205d69..8db0997858 100644 --- a/src/turbomind/models/llama/SequenceManager.cc +++ b/src/turbomind/models/llama/SequenceManager.cc @@ -51,6 +51,7 @@ SequenceManager::SequenceManager(size_t layer_num, if (enable_prefix_caching) { block_trie_ = std::make_shared(block_config.block_len_); } + TM_LOG_WARNING("[SegMgr] prefix caching is %s", enable_prefix_caching ? "enabled" : "disabled"); } const Sequence* SequenceManager::Create(uint64_t id) @@ -87,7 +88,7 @@ const Sequence* SequenceManager::Get(uint64_t id) if (rank_ == 0) { TM_LOG_INFO("[SeqMgr][Get] Reuse ID %llu, reset the mutable variables of the sequence", id); } - auto &seq = it->second; + auto& seq = it->second; seq.prompt.clear(); seq.tokens.clear(); seq.cache_len = 0; @@ -163,10 +164,10 @@ void SequenceManager::CachePrompt(const Sequences& sequences, int active_size) block_ids.size(), seq.prompt.size(), valid); - TM_LOG_INFO("[SeqMgr][CachePrompt] ID %llu, cached block_ids %s, unique_ids %s", - seq.id, - vector2string(block_ids).c_str(), - vector2string(block_unique_ids).c_str()); + TM_LOG_DEBUG("[SeqMgr][CachePrompt] ID %llu, cached block_ids %s, unique_ids %s", + seq.id, + vector2string(block_ids).c_str(), + vector2string(block_unique_ids).c_str()); } // remove invalid nodes from trie tree if there is any if (valid < block_ids.size()) { @@ -191,10 +192,10 @@ void SequenceManager::CacheGeneration(const Sequence& seq) block_ids.size(), seq.tokens.size(), valid); - TM_LOG_INFO("[SeqMgr][CacheGeneration] ID %llu, cached block_ids %s, unique_ids %s", - seq.id, - vector2string(block_ids).c_str(), - vector2string(block_unique_ids).c_str()); + TM_LOG_DEBUG("[SeqMgr][CacheGeneration] ID %llu, cached block_ids %s, unique_ids %s", + seq.id, + vector2string(block_ids).c_str(), + vector2string(block_unique_ids).c_str()); } // remove invalid nodes from trie tree if there is any if (valid < block_ids.size()) { @@ -498,16 +499,18 @@ void SequenceManager::PrefixMatch(Sequences& sequences) // seq.cache_len == 0 but seq.blocks is not empty. 
It means the new seq reuses an older seq's ID // So we should UNLOCK the unmatched blocks and reset seq.blocks as matched_blockes BlockIds unmatched_ids; - std::set_difference(seq.blocks.begin(), seq.blocks.end(), matched_ids.begin(), matched_ids.end(), + std::set_difference(seq.blocks.begin(), + seq.blocks.end(), + matched_ids.begin(), + matched_ids.end(), std::inserter(unmatched_ids, unmatched_ids.begin())); block_manager_->Unlock(unmatched_ids); seq.blocks.clear(); seq.block_unique_ids.clear(); if (rank_ == 0) { TM_LOG_INFO("[SegMgr][match] ID %llu, unlock unmatched blocks %d", seq.id, unmatched_ids.size()); - TM_LOG_INFO("[SegMgr][match] ID %llu, unmatched block_ids %s", - seq.id, - vector2string(unmatched_ids).c_str()); + TM_LOG_DEBUG( + "[SegMgr][match] ID %llu, unmatched block_ids %s", seq.id, vector2string(unmatched_ids).c_str()); } } seq.cache_len = valid * block_seq_len_; @@ -518,7 +521,7 @@ void SequenceManager::PrefixMatch(Sequences& sequences) seq.id, seq.blocks.size(), seq.cache_len); - TM_LOG_INFO("[SeqMgr][match] ID %llu, after matching, block_ids %s, unique_ids %s", + TM_LOG_DEBUG("[SeqMgr][match] ID %llu, after matching, block_ids %s, unique_ids %s", seq.id, vector2string(seq.blocks).c_str(), vector2string(seq.block_unique_ids).c_str()); diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index dbd69fcaf7..bb2ac61c3f 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -304,8 +304,8 @@ PYBIND11_MODULE(_turbomind, m) py::class_(m, "SessionParam") .def(py::init([](uint64_t id, int step) { ft::SessionParam param{}; - param.id = id; - param.step = step; + param.id = id; + param.step = step; return param; }), "id"_a, From 9d1df280b6d318424e851013a7961befbcdfb01a Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Thu, 3 Apr 2025 18:34:05 +0800 Subject: [PATCH 22/34] fix clang-format error --- src/turbomind/models/llama/LlamaBatch.cc | 1 - src/turbomind/models/llama/SequenceManager.cc | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 4d06ee8b28..a5e790b69d 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -1541,7 +1541,6 @@ auto LlamaBatch::Interrupt(int index, bool force_stop) -> Signal TM_LOG_INFO("[Interrupt] slot %d, tokens [%s]", index, ss.str().c_str()); } - const int output_len = state_->h_context_length[index]; auto& seq = *state_->sequences[index]; diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc index 8db0997858..5182d8f2b9 100644 --- a/src/turbomind/models/llama/SequenceManager.cc +++ b/src/turbomind/models/llama/SequenceManager.cc @@ -522,9 +522,9 @@ void SequenceManager::PrefixMatch(Sequences& sequences) seq.blocks.size(), seq.cache_len); TM_LOG_DEBUG("[SeqMgr][match] ID %llu, after matching, block_ids %s, unique_ids %s", - seq.id, - vector2string(seq.blocks).c_str(), - vector2string(seq.block_unique_ids).c_str()); + seq.id, + vector2string(seq.blocks).c_str(), + vector2string(seq.block_unique_ids).c_str()); } } } From e2a0c7abde78e7c3ed4fbf7596b6a2022f88a143 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Fri, 4 Apr 2025 12:20:45 +0800 Subject: [PATCH 23/34] fix vlm chat error --- lmdeploy/serve/async_engine.py | 61 ++++++++++++------------------- lmdeploy/serve/vl_async_engine.py | 4 -- 2 files changed, 24 insertions(+), 41 deletions(-) diff --git a/lmdeploy/serve/async_engine.py 
b/lmdeploy/serve/async_engine.py index 195a366529..15b866c8e8 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -12,7 +12,7 @@ from itertools import count from queue import Queue from threading import Thread -from typing import Any, AsyncIterator, Dict, Iterator, List, Literal, Optional, Tuple, Union +from typing import Any, AsyncIterator, Dict, Iterator, List, Literal, Optional, Union import torch import tqdm @@ -93,30 +93,19 @@ class Session: """Session for AsyncEngine.chat. Args: - _id (int): session_id for internal use. - _step (int): the offset of the k/v cache for internal use. - _prompt (Any): input prompt for internal use. - _response (Reaponse): model output for prompt. - _engine (Any): engine for internal use. - history (List[Any, str]): chat history. + _id (int): session_id for internal use + _engine (Any): engine for internal use + _response (Reaponse): model output for prompt + _gen_config (GenerationConfig): the generation config + messages (List[Dict]): chat history in openai format """ def __init__(self, session_id: int, engine: Any, gen_config: GenerationConfig = None): self._id: int = session_id self._engine = engine - self._step: int = 0 - self._prompt: Any = None self._response: Response = None self._gen_config = gen_config - self.history: List[Tuple[Any, str]] = [] - - def _merge_response(self, resp: Response, step: Union[Response, GenOut]): - """merge response.""" - resp.text += step.text if isinstance(step, Response) else step.response - resp.input_token_len = step.input_token_len - resp.generate_token_len = step.generate_token_len - resp.finish_reason = step.finish_reason - return resp + self.messages: List[Dict] = [] @property def response(self) -> Response: @@ -128,14 +117,7 @@ def close(self): if self._engine: self._engine._run(coro=self._engine.end_session(self._id)).result() self._engine = None - - def __repr__(self) -> str: - res = '' - for user, assistant in self.history: - if isinstance(user, list): - user = str(user) - res += f'USER:\n{user}\nASSISTANT:\n{assistant}\n' - return res + self.messages = [] def __enter__(self): return self @@ -821,7 +803,7 @@ def session(self, gen_config: GenerationConfig = None): return Session(self._run(fn=lambda: next(self._session_id)).result(), engine=self, gen_config=gen_config) def chat(self, - prompt: str, + prompt: Union[List[Dict], str], session=None, gen_config: Optional[GenerationConfig] = None, stream_response=False, @@ -829,7 +811,7 @@ def chat(self, """Chat. Args: - prompt (str): prompt + prompt (Union[List[Dict], str]): it can be an openai-like message or a string session (Session): the chat session gen_config (GenerationConfig | None): a instance of GenerationConfig. Default to None. 
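A usage sketch of the refactored `chat()`/`session()` pair shown in this diff, assuming the usual `pipeline` entry point constructs this engine; the model path and generation settings are placeholders:

```python
# Sketch: multi-turn chat with history kept in Session.messages (openai format).
from lmdeploy import GenerationConfig, pipeline

pipe = pipeline('/path/to/model')  # placeholder model path
gen_config = GenerationConfig(max_new_tokens=256)

sess = pipe.session(gen_config)
sess = pipe.chat('Hi, please introduce yourself', session=sess, gen_config=gen_config)
print(sess.response.text)
# the follow-up is appended to sess.messages, so the model sees the full history
sess = pipe.chat('Summarize what we have talked about so far', session=sess, gen_config=gen_config)
print(sess.response.text)
sess.close()
```

Since prefix caching substitutes for interactive mode (see the step reset in the next hunk), each call is a fresh request whose shared history is expected to be served from the prefix cache rather than from per-session state on the engine.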
@@ -840,16 +822,19 @@ def chat(self, if session is None: session = self.session() - # sync & init - session._prompt = prompt - session._response = None + if isinstance(prompt, str): + session.messages.append(dict(role='user', content=prompt)) + elif isinstance(prompt, List) and all(isinstance(_, Dict) for _ in prompt): + session.messages.extend(prompt) + else: + raise ValueError(f'unsupported prompt: {prompt}') - sequence_start = session._step == 0 + session._response = None - generator = self.infer(prompt, + generator = self.infer(session.messages, gen_config, - sequence_start=sequence_start, - sequence_end=False, + sequence_start=True, + sequence_end=True, session_id=session._id, stream_response=stream_response, multiplex=True) @@ -865,8 +850,10 @@ def _gen(): raise else: session._response = resp - session._step += resp.generate_token_len + resp.input_token_len - session.history.append((session._prompt, resp.text)) + session.messages.append(dict(role='user', content=resp.text)) + # Since prefix caching is used to substitute interactive mode, the context step should be + # reset after each round + self.id2step[session._id] = 0 if stream_response: session.generator = _gen() diff --git a/lmdeploy/serve/vl_async_engine.py b/lmdeploy/serve/vl_async_engine.py index 5bf227b661..b45ce1e92e 100644 --- a/lmdeploy/serve/vl_async_engine.py +++ b/lmdeploy/serve/vl_async_engine.py @@ -211,10 +211,6 @@ def chat(self, prompts: VLPromptType, *args, **kwargs): _prompts = self._convert_prompts(prompts) sess = super().chat(_prompts, *args, **kwargs) - # recover prompts & history - sess._prompt = prompts - last_round = sess.history[-1] - sess.history[-1] = (prompts, last_round[-1]) return sess @classmethod From 5e34425627b2189e722814205500183f1dd6e40b Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Fri, 4 Apr 2025 16:12:41 +0800 Subject: [PATCH 24/34] fix get_logits --- lmdeploy/serve/utils.py | 146 ++++++++++-------- src/turbomind/models/llama/SequenceManager.cc | 2 +- 2 files changed, 84 insertions(+), 64 deletions(-) diff --git a/lmdeploy/serve/utils.py b/lmdeploy/serve/utils.py index 4b04761f80..98921118e0 100644 --- a/lmdeploy/serve/utils.py +++ b/lmdeploy/serve/utils.py @@ -45,39 +45,85 @@ def get_reward_score(self, input_ids: List) -> List[float]: async def _async_get_logits(self, input_ids, steps: List[int] = None, + max_input_len: int = None, sequence_start: bool = True, sequence_end: bool = True) -> List[torch.Tensor]: assert input_ids and all(isinstance(_, List) for _ in input_ids) assert steps is None or (len(steps) == len(input_ids)) + steps = steps or [0] * len(input_ids) + max_input_len = max_input_len or max([len(x) for x in input_ids]) + + if self.backend == 'turbomind': + logits = await self._async_get_logits_by_turbomind(input_ids, steps, max_input_len) + else: + logits = await self._async_get_logits_by_pytorch(input_ids, steps, max_input_len, sequence_start, + sequence_end) + return logits + + async def _async_get_logits_by_turbomind(self, input_ids, steps, max_input_len): + assert len(input_ids) == len(steps) + + if any(s != 0 for s in steps): + assert self.backend_config.enable_prefix_caching, 'please enable prefix caching' + assert all(s % self.backend_config.cache_block_seq_len == 0 for s in steps) + logits = [None] * len(input_ids) + gen_config = GenerationConfig(max_new_tokens=1, output_logits='all', do_sample=False) async def _proc(i): async with self.model_inst(session_id=i) as inst: - input_len = len(input_ids[i]) - # TODO(lvhan): Fix the ugly code later on - max_new_tokens = 1 if 
self.backend == 'turbomind' else 0 + token_ids = input_ids[i][:steps[i] + max_input_len] + input_len = len(token_ids) + async with self.safe_run(inst, + session_id=i, + input_ids=token_ids, + gen_config=gen_config, + stream_output=False, + step=steps[i]) as gen: + async for outputs in gen: + pass + logits[i] = outputs.logits[:input_len - steps[i], :] + + tasks = [_proc(i) for i in range(len(input_ids))] + await asyncio.gather(*tasks) + + return logits + + async def _async_get_logits_by_pytorch(self, + input_ids: List[List[int]], + steps: List[int], + max_input_len: int, + sequence_start: bool = True, + sequence_end: bool = True): + logits = [None] * len(input_ids) + + async def _proc(i): + async with self.model_inst(session_id=i) as inst: + token_ids = input_ids[i][steps[i]:steps[i] + max_input_len] + input_len = len(token_ids) # The reason to set `top_k=1` is that pt engine crashes at top_k sampling stage # when perform inference on a reward model. - gen_config = GenerationConfig(max_new_tokens=max_new_tokens, output_logits='all', top_k=1) + gen_config = GenerationConfig(max_new_tokens=0, output_logits='all', top_k=1) async with self.safe_run(inst, session_id=i, - input_ids=input_ids[i], + input_ids=token_ids, gen_config=gen_config, stream_output=False, sequence_start=sequence_start, sequence_end=sequence_end, - step=steps[i] if steps else 0) as gen: + step=steps[i]) as gen: async for outputs in gen: pass - logits[i] = outputs.logits[:input_len - steps[i], :] - logger.info(f'logits[{i}].shape: {logits[i].shape}, input_len: {input_len}, step: {steps[i]}') + logits[i] = outputs.logits[:input_len, :] session_ids = list(range(len(input_ids))) tasks = [_proc(i) for i in range(len(input_ids))] await asyncio.gather(*tasks) - - return logits, session_ids + if sequence_end: + for i in session_ids: + await self.end_session(i) + return logits def get_ppl(self, input_ids: Union[List[int], List[List[int]]]) -> List[float]: """Get perplexity scores given a list of input tokens that have to be @@ -95,10 +141,7 @@ def get_ppl(self, input_ids: Union[List[int], List[List[int]]]) -> List[float]: input_ids = [input_ids] assert all(len(_) > 1 for _ in input_ids) - # TODO: a better way to determine `max_input_len`, at most allocate - # 2G mem for logits with shape [bs, max_input_len, vocab_size] - vocab_size = self.hf_tm_cfg.vocab_size - max_input_len = 2 * 1024**3 // (vocab_size * 4) + max_input_len = self.backend_config.max_prefill_token_num sizes = [len(_) for _ in input_ids] result = [] sorted_index_values = sorted(list(enumerate(sizes)), key=lambda x: x[1], reverse=True) @@ -110,17 +153,13 @@ def get_ppl(self, input_ids: Union[List[int], List[List[int]]]) -> List[float]: logger.info(f'start: {start}, end: {end}') if start == end: _input_ids = input_ids[indices[start]] - res, session_ids = self._get_long_text_ppl(input_ids=_input_ids, max_input_len=max_input_len) + res = self._get_long_text_ppl(input_ids=_input_ids, max_input_len=max_input_len) result.append(res) else: _input_ids = [input_ids[indices[i]] for i in range(start, end)] - res, session_ids = self._get_ppl( - input_ids=_input_ids, - max_input_len=max_input_len, - ) + steps = [0] * len(_input_ids) + res, _ = self._get_ppl(input_ids=_input_ids, steps=steps, max_input_len=max_input_len) result.extend(res) - for session_id in session_ids: - self.end_session(session_id) output = list(range(len(result))) for index, sorted_index in enumerate(indices): output[sorted_index] = result[index] @@ -156,57 +195,37 @@ def _get_long_text_ppl(self, input_ids, 
max_input_len): losses = [] target_counts = [] - session_ids = [] for i in range(0, seq_len, max_input_len): - token_ids = input_ids[i:i + max_input_len] - step = [i] - # shift token_ids by 1 to the left - target_ids = input_ids[i + 1:i + 1 + max_input_len] - loss, session_ids = self._get_ppl(input_ids=[token_ids], - max_input_len=len(token_ids), - target_ids=[target_ids], - steps=step, - sequence_start=(i == 0), - sequence_end=False) + loss, target_count = self._get_ppl(input_ids=[input_ids], + steps=[i], + max_input_len=max_input_len, + sequence_start=(i == 0), + sequence_end=False) losses.extend(loss) - target_counts.append(len(target_ids)) + target_counts.extend(target_count) losses = [loss * target_count for loss, target_count in zip(losses, target_counts)] loss_sum = sum(losses) target_count = sum(target_counts) - return loss_sum / target_count, session_ids - - def _get_ppl(self, - input_ids, - max_input_len, - target_ids=None, - steps=None, - sequence_start: bool = True, - sequence_end: bool = True): - assert (isinstance(input_ids, List) and all(isinstance(_, List) for _ in input_ids)) - assert steps is None or len(steps) == len(input_ids) - assert target_ids is None or len(target_ids) == len(input_ids) - - lens = [len(_) for _ in input_ids] - total_len = sum(lens) - assert sum(lens) <= max_input_len - - logger.info(f'get_ppl: bs: {len(input_ids)}, lens: {lens}, ' - f'total_len: {total_len}, steps: {steps}') + return loss_sum / target_count + + def _get_ppl(self, input_ids, steps, max_input_len, sequence_start: bool = True, sequence_end: bool = True): + assert isinstance(steps, List) and len(steps) == len(input_ids) + torch.cuda.empty_cache() - logits, session_ids = self._run(coro=self._async_get_logits( - input_ids=input_ids, steps=steps, sequence_start=sequence_start, sequence_end=sequence_end)).result() + logits = self._run(coro=self._async_get_logits(input_ids=input_ids, + steps=steps, + max_input_len=max_input_len, + sequence_start=sequence_start, + sequence_end=sequence_end)).result() padding_token_id = -100 - if target_ids is None: - target_ids = [x[1:] + [padding_token_id] for x in input_ids] - else: - target_ids = [ - target_ids[i] + [padding_token_id] if len(target_ids[i]) < len(input_ids[i]) else target_ids[i] - for i in range(len(input_ids)) - ] - target_ids = [torch.Tensor(torch.LongTensor(_target_ids)) for _target_ids in target_ids] + # shift token_ids by 1 to the left + target_ids = [s[steps[i] + 1:steps[i] + 1 + max_input_len] for i, s in enumerate(input_ids)] + target_ids = [t + [padding_token_id] if len(t) < max_input_len else t for t in target_ids] + target_ids = [torch.Tensor(torch.LongTensor(t)) for t in target_ids] result = [] + target_counts = [] for _logits, _target_ids in zip(logits, target_ids): _logits = _logits.float() vocab_size = _logits.shape[-1] @@ -222,5 +241,6 @@ def _get_ppl(self, loss = flat_loss_matrix.sum() target_count = target_mask.sum() result.append(loss.item() / target_count.item()) + target_counts.append(target_count) logger.info(f'ppl result: {result}') - return result, session_ids + return result, target_counts diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc index 5182d8f2b9..f29fa197ff 100644 --- a/src/turbomind/models/llama/SequenceManager.cc +++ b/src/turbomind/models/llama/SequenceManager.cc @@ -489,7 +489,7 @@ void SequenceManager::PrefixMatch(Sequences& sequences) // block_manager_->Touch(matched_ids); if (rank_ == 0) { TM_LOG_INFO("[SeqMgr][match] ID %llu, hit blocks %d, 
cache_len %d", seq.id, valid, seq.cache_len); - TM_LOG_INFO("[SeqMgr][match] ID %llu, hit block_ids %s, unique_ids %s", + TM_LOG_DEBUG("[SeqMgr][match] ID %llu, hit block_ids %s, unique_ids %s", seq.id, vector2string(block_ids).c_str(), vector2string(unique_ids).c_str()); From 1cbdf5afdcba06c1dece39c50a48156e85919db6 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Fri, 4 Apr 2025 20:43:28 +0800 Subject: [PATCH 25/34] remove killing from tm csrc --- lmdeploy/serve/utils.py | 2 -- src/turbomind/engine/gateway.h | 25 +----------------- src/turbomind/engine/model_request.cc | 12 --------- src/turbomind/engine/model_request.h | 3 --- src/turbomind/engine/request.h | 4 --- src/turbomind/engine/request_queue.h | 22 ++-------------- src/turbomind/models/llama/LlamaBatch.cc | 32 +++--------------------- src/turbomind/models/llama/LlamaBatch.h | 4 +-- src/turbomind/python/bind.cpp | 10 +------- 9 files changed, 8 insertions(+), 106 deletions(-) diff --git a/lmdeploy/serve/utils.py b/lmdeploy/serve/utils.py index 98921118e0..1fb5997222 100644 --- a/lmdeploy/serve/utils.py +++ b/lmdeploy/serve/utils.py @@ -38,8 +38,6 @@ def get_reward_score(self, input_ids: List) -> List[float]: logits, session_ids = self._run(coro=self._async_get_logits(input_ids=input_ids)).result() logits = [x.squeeze() for x in logits] scores = [x[-1].cpu().item() for x in logits] - for session_id in session_ids: - self.end_session(session_id) return scores async def _async_get_logits(self, diff --git a/src/turbomind/engine/gateway.h b/src/turbomind/engine/gateway.h index 0b7d05c25e..3508fec789 100644 --- a/src/turbomind/engine/gateway.h +++ b/src/turbomind/engine/gateway.h @@ -82,14 +82,12 @@ class Gateway { } void pop(std::vector>& infer_reqs, - std::vector>& kill_reqs, unsigned max_infer, bool blocking, bool& abort, int rank) { infer_reqs.clear(); - kill_reqs.clear(); [&] { for (int i = 0; i < size_; ++i) { @@ -104,7 +102,7 @@ class Gateway { blocking = blocking && infer_reqs.empty(); - if (queues_[rank]->pop(infer_reqs, kill_reqs, max_infer, blocking, abort)) { + if (queues_[rank]->pop(infer_reqs, max_infer, blocking, abort)) { const int group_id = rank / group_size_; // Wake all siblings for (int i = group_id * group_size_; i < (group_id + 1) * group_size_; ++i) { @@ -132,14 +130,6 @@ class Gateway { seqid2rank_.bind(bind_ids, rank); } - // Unbind for stateful kill - std::vector unbind_ids; - for (const auto& r : kill_reqs) { - unbind_ids.push_back(r->session.id); - } - if (!unbind_ids.empty()) { - seqid2rank_.unbind(unbind_ids, rank); - } } void cancel(std::shared_ptr r) @@ -155,19 +145,6 @@ class Gateway { } } - void kill(std::shared_ptr r) - { - if (auto rank = seqid2rank_.find(r->session.id); rank >= 0) { - queues_[rank]->kill(std::move(r)); - } - else { - TM_LOG_ERROR("[Gateway] Failed to find a binded queue for %lu", r->session.id); - notify({[r = std::move(r)] { // - UpdateState(*r, Request::kInvalid, 0); - }}); - } - } - void notify(std::vector signals) { return signal_buffer_.push(std::move(signals)); diff --git a/src/turbomind/engine/model_request.cc b/src/turbomind/engine/model_request.cc index ccada742e2..7896db0444 100644 --- a/src/turbomind/engine/model_request.cc +++ b/src/turbomind/engine/model_request.cc @@ -71,18 +71,6 @@ void ModelRequest::Cancel() } } -void ModelRequest::End(std::function cb, uint64_t session_id) -{ - auto r = std::make_shared(); - - r->id = r->session.id = session_id; - r->session.kill_flag = true; - - r->end_cb = std::move(cb); - - gateway_->kill(std::move(r)); -} - auto 
ModelRequest::Forward(InputParam param, std::function cb) -> OutputParam { inputs_ = std::make_shared(); diff --git a/src/turbomind/engine/model_request.h b/src/turbomind/engine/model_request.h index aea889e856..1a0287d20c 100644 --- a/src/turbomind/engine/model_request.h +++ b/src/turbomind/engine/model_request.h @@ -18,9 +18,6 @@ class ModelRequest { // Cancel running request void Cancel(); - // Reset the channel to uninitailized state, calls `notify` when done - void End(std::function cb, uint64_t session_id); - using TensorMap_ = std::unordered_map; struct InputParam { diff --git a/src/turbomind/engine/request.h b/src/turbomind/engine/request.h index 33d4e7bc99..17ab3139f2 100644 --- a/src/turbomind/engine/request.h +++ b/src/turbomind/engine/request.h @@ -81,10 +81,6 @@ struct SessionParam { uint64_t id; int step; - - // bool start_flag; - // bool end_flag; - bool kill_flag; }; struct RequestState { diff --git a/src/turbomind/engine/request_queue.h b/src/turbomind/engine/request_queue.h index 3d60dbe664..46890d79d4 100644 --- a/src/turbomind/engine/request_queue.h +++ b/src/turbomind/engine/request_queue.h @@ -28,18 +28,6 @@ class RequestQueue { cv_.notify_one(); } - void kill(std::shared_ptr r) - { - { - std::lock_guard lock{mutex_}; - if (closed_) { - throw std::runtime_error("Queue is clsoed"); - } - kill_.push_back(std::move(r)); - } - cv_.notify_one(); - } - int try_pop(std::vector>& rs, int max_rs_size, int max_count) { std::lock_guard lock{mutex_}; @@ -58,7 +46,6 @@ class RequestQueue { } bool pop(std::vector>& infer_reqs, - std::vector>& kill_reqs, unsigned max_infer, bool blocking, bool& abort) @@ -69,8 +56,8 @@ class RequestQueue { if (blocking) { cv_.wait(lock, [this] { - return !(queue_.empty() && kill_.empty()) // - || flag_->load(std::memory_order_relaxed) == expected_ // + return !(queue_.empty()) + || flag_->load(std::memory_order_relaxed) == expected_ || closed_; }); if (closed_) { @@ -93,9 +80,6 @@ class RequestQueue { queue_.pop_front(); } - kill_reqs.insert(kill_reqs.end(), kill_.begin(), kill_.end()); - kill_.clear(); - return is_first; } @@ -129,8 +113,6 @@ class RequestQueue { std::pmr::list> queue_; std::pmr::unsynchronized_pool_resource pool_; - std::vector> kill_; - std::mutex mutex_; std::condition_variable cv_; diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index 7fa6a47cef..a6b9ae8e02 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -94,7 +94,7 @@ void DropEmbeddings(const Sequence& seq) } template -void LlamaBatch::DisableInvalidRequests(Requests& infer_reqs, Requests& kill_reqs) +void LlamaBatch::DisableInvalidRequests(Requests& infer_reqs) { NvtxScope _("disable invalid"); @@ -123,10 +123,7 @@ void LlamaBatch::DisableInvalidRequests(Requests& infer_reqs, Requests& kill_ } } - count(kill_reqs); count(infer_reqs); - - validate(kill_reqs, "kill"); validate(infer_reqs, "infer"); // New requests that never get a chance to start @@ -168,26 +165,6 @@ void LlamaBatch::ProcessCancelRequests(std::vector& indices, std::vector } } -template -void LlamaBatch::ProcessKillRequests(const Requests& kill_reqs, std::vector& signals) -{ - for (auto& r : kill_reqs) { - if (r) { - int ec = r->ec; - if (!ec) { - if (!sequence_manager_->Erase(r->id)) { - ec = Request::kInvalid; - } - } - signals.push_back([=] { - if (r->end_cb) { - r->end_cb(ec); - } - }); - } - } -} - template void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vector& signals) { @@ -1575,7 
+1552,6 @@ namespace { struct RequestData { std::vector> infer; // incoming inference request - std::vector> kill; // incoming kill request std::vector cancel; // canceled indices in current batch bool abort; @@ -1605,10 +1581,10 @@ void LlamaBatch::InternalThreadEntry() const int free_slot_count = max_batch_size_ - state_->size + g.finished_count; const bool is_empty = (free_slot_count == max_batch_size_); // Block if batch is empty AND no silbings are ready - gateway_->pop(req->infer, req->kill, free_slot_count, is_empty, req->abort, dp_rank_); + gateway_->pop(req->infer, free_slot_count, is_empty, req->abort, dp_rank_); } // Mark reqs to the same session_id as invalid (which are dangerous to the engine) - DisableInvalidRequests(req->infer, req->kill); + DisableInvalidRequests(req->infer); FindCanceledIndices(req->cancel); } @@ -1628,8 +1604,6 @@ void LlamaBatch::InternalThreadEntry() std::vector signals; - ProcessKillRequests(req->kill, signals); - // Shared `priority` field will be assigned by rank-0 ProcessInferRequests(req->infer, signals); diff --git a/src/turbomind/models/llama/LlamaBatch.h b/src/turbomind/models/llama/LlamaBatch.h index 9f4a6546b2..0aff2c65d5 100644 --- a/src/turbomind/models/llama/LlamaBatch.h +++ b/src/turbomind/models/llama/LlamaBatch.h @@ -76,9 +76,7 @@ class LlamaBatch { using Requests = std::vector>; using Signal = std::function; - void DisableInvalidRequests(Requests& infer_reqs, Requests& kill_reqs); - - void ProcessKillRequests(const Requests& reqs, std::vector& signals); + void DisableInvalidRequests(Requests& infer_reqs); void ProcessInferRequests(const Requests& reqs, std::vector& signals); diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index bb2ac61c3f..947c4248f4 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -487,15 +487,7 @@ PYBIND11_MODULE(_turbomind, m) [](ModelRequest* model_request) { model_request->Cancel(); // }, - py::call_guard()) - .def( - "end", - [](ModelRequest* model_request, std::function cb, uint64_t session_id) { - model_request->End(std::move(cb), session_id); // - }, - py::call_guard(), - "cb"_a, - "session_id"_a); + py::call_guard()); // transformer model using ft::AbstractTransformerModel; From afd531daa1fa70337df8165835541b71cbcc497f Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Sun, 6 Apr 2025 17:38:43 +0800 Subject: [PATCH 26/34] fix clang-format --- src/turbomind/engine/gateway.h | 8 ++------ src/turbomind/engine/request_queue.h | 9 ++------- src/turbomind/models/llama/SequenceManager.cc | 6 +++--- 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/src/turbomind/engine/gateway.h b/src/turbomind/engine/gateway.h index 3508fec789..bb3295162d 100644 --- a/src/turbomind/engine/gateway.h +++ b/src/turbomind/engine/gateway.h @@ -81,11 +81,8 @@ class Gateway { } } - void pop(std::vector>& infer_reqs, - unsigned max_infer, - bool blocking, - bool& abort, - int rank) + void + pop(std::vector>& infer_reqs, unsigned max_infer, bool blocking, bool& abort, int rank) { infer_reqs.clear(); @@ -129,7 +126,6 @@ class Gateway { if (!bind_ids.empty()) { seqid2rank_.bind(bind_ids, rank); } - } void cancel(std::shared_ptr r) diff --git a/src/turbomind/engine/request_queue.h b/src/turbomind/engine/request_queue.h index 46890d79d4..c62a756d71 100644 --- a/src/turbomind/engine/request_queue.h +++ b/src/turbomind/engine/request_queue.h @@ -45,10 +45,7 @@ class RequestQueue { return count; } - bool pop(std::vector>& infer_reqs, - unsigned max_infer, - bool blocking, - bool& 
abort) + bool pop(std::vector>& infer_reqs, unsigned max_infer, bool blocking, bool& abort) { std::unique_lock lock{mutex_}; @@ -56,9 +53,7 @@ class RequestQueue { if (blocking) { cv_.wait(lock, [this] { - return !(queue_.empty()) - || flag_->load(std::memory_order_relaxed) == expected_ - || closed_; + return !(queue_.empty()) || flag_->load(std::memory_order_relaxed) == expected_ || closed_; }); if (closed_) { abort = true; diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc index f29fa197ff..a3e1f010ab 100644 --- a/src/turbomind/models/llama/SequenceManager.cc +++ b/src/turbomind/models/llama/SequenceManager.cc @@ -490,9 +490,9 @@ void SequenceManager::PrefixMatch(Sequences& sequences) if (rank_ == 0) { TM_LOG_INFO("[SeqMgr][match] ID %llu, hit blocks %d, cache_len %d", seq.id, valid, seq.cache_len); TM_LOG_DEBUG("[SeqMgr][match] ID %llu, hit block_ids %s, unique_ids %s", - seq.id, - vector2string(block_ids).c_str(), - vector2string(unique_ids).c_str()); + seq.id, + vector2string(block_ids).c_str(), + vector2string(unique_ids).c_str()); } if (!seq.blocks.empty()) { From 3dc9ffabb5e44f365fe3c1750d303f5369bf9045 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Mon, 7 Apr 2025 20:27:58 +0800 Subject: [PATCH 27/34] update --- src/turbomind/engine/model_request.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/turbomind/engine/model_request.cc b/src/turbomind/engine/model_request.cc index 7896db0444..80107ad5f2 100644 --- a/src/turbomind/engine/model_request.cc +++ b/src/turbomind/engine/model_request.cc @@ -99,9 +99,9 @@ auto ModelRequest::Forward(InputParam param, std::function cb) -> Output // Max possible length of a sequence, this depends on `history_len` which isn't available here, so `session_len` // is used instead const int max_seq_len = session_len_ + 1; - const int max_out_len = std::min(output_len, session_len_); + const int max_out_len = std::min(output_len, session_len_) + 1; // This does not include history length in interactive mode - const int max_in_out_len = std::min(input_len + output_len, session_len_); + const int max_in_out_len = std::min(input_len + output_len, session_len_) + 1; for (auto& [k, v] : *param.tensors) { inputs_->emplace(k, v); From 14eb22aa73731e02780715625c34ce1b94b9d3a5 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Mon, 7 Apr 2025 22:05:17 +0800 Subject: [PATCH 28/34] enable_prefix_caching defaults to True --- benchmark/profile_generation.py | 6 +++--- benchmark/profile_pipeline_api.py | 6 +++--- benchmark/profile_throughput.py | 6 +++--- lmdeploy/cli/cli.py | 4 ++-- lmdeploy/cli/serve.py | 12 ++++++------ lmdeploy/cli/utils.py | 6 +++--- lmdeploy/messages.py | 6 +++--- lmdeploy/pytorch/config.py | 2 +- 8 files changed, 24 insertions(+), 24 deletions(-) diff --git a/benchmark/profile_generation.py b/benchmark/profile_generation.py index 3efeaa05b3..90fb54b265 100644 --- a/benchmark/profile_generation.py +++ b/benchmark/profile_generation.py @@ -324,7 +324,7 @@ def parse_args(): cache_count_act = ArgumentHelper.cache_max_entry_count(pt_group) cache_block_seq_len_act = ArgumentHelper.cache_block_seq_len(pt_group) session_len_act = ArgumentHelper.session_len(pt_group, default=2048) - prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group) + prefix_caching_act = ArgumentHelper.disable_prefix_caching(pt_group) rope_scaling_factor_act = ArgumentHelper.rope_scaling_factor(pt_group) dtype_act = ArgumentHelper.dtype(pt_group) @@ -390,7 +390,7 @@ def main(): 
session_len=session_len, rope_scaling_factor=args.rope_scaling_factor, tp=args.tp, - enable_prefix_caching=args.enable_prefix_caching, + enable_prefix_caching=not args.disable_prefix_caching, dtype=args.dtype, ) elif args.backend == 'pytorch': @@ -400,7 +400,7 @@ def main(): session_len=session_len, tp=args.tp, eager_mode=args.eager_mode, - enable_prefix_caching=args.enable_prefix_caching, + enable_prefix_caching=not args.disable_prefix_caching, dtype=args.dtype, ) gen_config = GenerationConfig(top_k=args.top_k, diff --git a/benchmark/profile_pipeline_api.py b/benchmark/profile_pipeline_api.py index ab7c7b8495..d7ec874569 100644 --- a/benchmark/profile_pipeline_api.py +++ b/benchmark/profile_pipeline_api.py @@ -154,7 +154,7 @@ def parse_args(): session_len_act = ArgumentHelper.session_len(pt_group, default=4096) cache_count_act = ArgumentHelper.cache_max_entry_count(pt_group) cache_block_seq_len_act = ArgumentHelper.cache_block_seq_len(pt_group) - prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group) + prefix_caching_act = ArgumentHelper.disable_prefix_caching(pt_group) # turbomind engine args tb_group = parser.add_argument_group('TurboMind engine argument') @@ -188,7 +188,7 @@ def main(): quant_policy=args.quant_policy, num_tokens_per_iter=args.num_tokens_per_iter, max_prefill_iters=args.max_prefill_iters, - enable_prefix_caching=args.enable_prefix_caching, + enable_prefix_caching=not args.disable_prefix_caching, communicator=args.communicator, ) elif args.backend == 'pytorch': @@ -200,7 +200,7 @@ def main(): tp=args.tp, thread_safe=False, eager_mode=args.eager_mode, - enable_prefix_caching=args.enable_prefix_caching, + enable_prefix_caching=not args.disable_prefix_caching, ) engine = Engine(args.model_path, engine_config, csv=args.csv) diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py index 4fccd4531c..5a26c7a603 100644 --- a/benchmark/profile_throughput.py +++ b/benchmark/profile_throughput.py @@ -209,7 +209,7 @@ def parse_args(): session_len_act = ArgumentHelper.session_len(pt_group, default=4096) cache_count_act = ArgumentHelper.cache_max_entry_count(pt_group) cache_block_seq_len_act = ArgumentHelper.cache_block_seq_len(pt_group) - prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group) + prefix_caching_act = ArgumentHelper.disable_prefix_caching(pt_group) quant_policy_act = ArgumentHelper.quant_policy(pt_group, default=0) dtype_act = ArgumentHelper.dtype(pt_group) @@ -248,7 +248,7 @@ def main(): quant_policy=args.quant_policy, num_tokens_per_iter=args.num_tokens_per_iter, max_prefill_iters=args.max_prefill_iters, - enable_prefix_caching=args.enable_prefix_caching, + enable_prefix_caching=not args.disable_prefix_caching, dtype=args.dtype, communicator=args.communicator, ) @@ -260,7 +260,7 @@ def main(): max_batch_size=args.concurrency, tp=args.tp, eager_mode=args.eager_mode, - enable_prefix_caching=args.enable_prefix_caching, + enable_prefix_caching=not args.disable_prefix_caching, quant_policy=args.quant_policy, dtype=args.dtype, distributed_executor_backend=args.distributed_executor_backend, diff --git a/lmdeploy/cli/cli.py b/lmdeploy/cli/cli.py index 6d594e1d75..59e0482b91 100644 --- a/lmdeploy/cli/cli.py +++ b/lmdeploy/cli/cli.py @@ -104,7 +104,7 @@ def add_parser_chat(): tp_act = ArgumentHelper.tp(pt_group) session_len_act = ArgumentHelper.session_len(pt_group) cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group) - prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group) + prefix_caching_act = 
ArgumentHelper.disable_prefix_caching(pt_group) quant_policy = ArgumentHelper.quant_policy(pt_group) # turbomind args @@ -237,7 +237,7 @@ def chat(args): session_len=args.session_len, cache_max_entry_count=args.cache_max_entry_count, adapters=adapters, - enable_prefix_caching=args.enable_prefix_caching, + enable_prefix_caching=not args.disable_prefix_caching, device_type=args.device, eager_mode=args.eager_mode, quant_policy=args.quant_policy) diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py index 0cac965633..9b37ca16ca 100644 --- a/lmdeploy/cli/serve.py +++ b/lmdeploy/cli/serve.py @@ -60,7 +60,7 @@ def add_parser_gradio(): max_batch_size_act = ArgumentHelper.max_batch_size(pt_group) cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group) cache_block_seq_len_act = ArgumentHelper.cache_block_seq_len(pt_group) - prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group) + prefix_caching_act = ArgumentHelper.disable_prefix_caching(pt_group) max_prefill_token_num_act = ArgumentHelper.max_prefill_token_num(pt_group) # turbomind args tb_group = parser.add_argument_group('TurboMind engine arguments') @@ -160,7 +160,7 @@ def add_parser_api_server(): max_batch_size_act = ArgumentHelper.max_batch_size(pt_group) cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group) cache_block_seq_len_act = ArgumentHelper.cache_block_seq_len(pt_group) - prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group) + prefix_caching_act = ArgumentHelper.disable_prefix_caching(pt_group) max_prefill_token_num_act = ArgumentHelper.max_prefill_token_num(pt_group) quant_policy = ArgumentHelper.quant_policy(pt_group) ArgumentHelper.dp(pt_group) @@ -250,7 +250,7 @@ def gradio(args): cache_max_entry_count=args.cache_max_entry_count, block_size=args.cache_block_seq_len, session_len=args.session_len, - enable_prefix_caching=args.enable_prefix_caching, + enable_prefix_caching=not args.disable_prefix_caching, device_type=args.device, quant_policy=args.quant_policy, eager_mode=args.eager_mode, @@ -265,7 +265,7 @@ def gradio(args): rope_scaling_factor=args.rope_scaling_factor, cache_max_entry_count=args.cache_max_entry_count, cache_block_seq_len=args.cache_block_seq_len, - enable_prefix_caching=args.enable_prefix_caching, + enable_prefix_caching=not args.disable_prefix_caching, max_prefill_token_num=args.max_prefill_token_num, communicator=args.communicator) chat_template_config = get_chat_template(args.chat_template) @@ -303,7 +303,7 @@ def api_server(args): block_size=args.cache_block_seq_len, session_len=args.session_len, adapters=adapters, - enable_prefix_caching=args.enable_prefix_caching, + enable_prefix_caching=not args.disable_prefix_caching, device_type=args.device, quant_policy=args.quant_policy, eager_mode=args.eager_mode, @@ -319,7 +319,7 @@ def api_server(args): rope_scaling_factor=args.rope_scaling_factor, cache_max_entry_count=args.cache_max_entry_count, cache_block_seq_len=args.cache_block_seq_len, - enable_prefix_caching=args.enable_prefix_caching, + enable_prefix_caching=not args.disable_prefix_caching, max_prefill_token_num=args.max_prefill_token_num, communicator=args.communicator) chat_template_config = get_chat_template(args.chat_template) diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py index 0ea38a9153..622fd33fda 100644 --- a/lmdeploy/cli/utils.py +++ b/lmdeploy/cli/utils.py @@ -453,13 +453,13 @@ def cache_block_seq_len(parser): 'be ignored') @staticmethod - def enable_prefix_caching(parser): + def disable_prefix_caching(parser): """Add argument 
enable_prefix_caching to parser.""" - return parser.add_argument('--enable-prefix-caching', + return parser.add_argument('--disable-prefix-caching', action='store_true', default=False, - help='Enable cache and match prefix') + help='Disable prefix caching') @staticmethod def num_tokens_per_iter(parser): diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index d3b0efccd6..85f78ca122 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -192,7 +192,7 @@ class TurbomindEngineConfig: cache_block_seq_len (int): the length of the token sequence in a k/v block, default to 64 enable_prefix_caching (bool): enable cache prompts for block reuse, - default to False + default to True quant_policy (int): default to 0. When k/v is quantized into 4 or 8 bit, set it to 4 or 8, respectively rope_scaling_factor (float): scaling factor used for dynamic ntk, @@ -228,7 +228,7 @@ class TurbomindEngineConfig: cache_max_entry_count: float = 0.8 cache_chunk_size: int = -1 cache_block_seq_len: int = 64 - enable_prefix_caching: bool = False + enable_prefix_caching: bool = True quant_policy: int = 0 rope_scaling_factor: float = 0.0 use_logn_attn: bool = False @@ -313,7 +313,7 @@ class PytorchEngineConfig: adapters: Dict[str, str] = None max_prefill_token_num: int = 4096 thread_safe: bool = False - enable_prefix_caching: bool = False + enable_prefix_caching: bool = True device_type: str = 'cuda' eager_mode: bool = False custom_module_map: Dict[str, str] = None diff --git a/lmdeploy/pytorch/config.py b/lmdeploy/pytorch/config.py index 9c9840439b..1227e897aa 100644 --- a/lmdeploy/pytorch/config.py +++ b/lmdeploy/pytorch/config.py @@ -76,7 +76,7 @@ class CacheConfig: window_size: int = -1 cache_max_entry_count: float = 0.8 max_prefill_token_num: int = 4096 - enable_prefix_caching: bool = False + enable_prefix_caching: bool = True quant_policy: Literal[0, 4, 8] = 0 device_type: str = 'cuda' From 7e13a189208eee97dfaac785a9a559699f883926 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 8 Apr 2025 13:22:46 +0800 Subject: [PATCH 29/34] merge pt chat.py and tm chat.py --- lmdeploy/cli/chat.py | 82 ++++++++++++++++++++++++++++++++++ lmdeploy/cli/cli.py | 38 ++-------------- lmdeploy/serve/async_engine.py | 3 +- 3 files changed, 88 insertions(+), 35 deletions(-) create mode 100644 lmdeploy/cli/chat.py diff --git a/lmdeploy/cli/chat.py b/lmdeploy/cli/chat.py new file mode 100644 index 0000000000..0b08ce85b9 --- /dev/null +++ b/lmdeploy/cli/chat.py @@ -0,0 +1,82 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import fire + +from lmdeploy import ChatTemplateConfig, GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline +from lmdeploy.archs import autoget_backend + + +def input_prompt(): + """Input a prompt in the consolo interface.""" + print('\ndouble enter to end input >>> ', end='') + sentinel = '' # ends when this string is seen + return '\n'.join(iter(input, sentinel)) + + +def build_pipe(model_path, backend, **kwargs): + engine_config = None + if backend == 'turbomind': + engine_config = TurbomindEngineConfig() + for key, value in kwargs.items(): + if hasattr(TurbomindEngineConfig, key): + setattr(engine_config, key, value) + else: + engine_config = PytorchEngineConfig() + for key, value in kwargs.items(): + if hasattr(PytorchEngineConfig, key): + setattr(engine_config, key, value) + if kwargs.get("adapters", None): + from .utils import get_lora_adapters + adapters = get_lora_adapters(kwargs['adapters']) + engine_config.adapters = adapters + + chat_template = kwargs.get('chat_template', None) + chat_template_config = None + if chat_template: + chat_template_config = ChatTemplateConfig(model_name=chat_template) + + pipe = pipeline(model_path, backend_config=engine_config, chat_template_config=chat_template_config, **kwargs) + return pipe + + +def build_gen_config(**kwargs): + gen_config = GenerationConfig(max_new_tokens=1024, top_k=40, top_p=0.8, temperature=0.8, repetition_penalty=1.0) + for key, value in kwargs.items(): + if hasattr(GenerationConfig, key): + setattr(gen_config, key, value) + return gen_config + + +def main(model_path, backend, **kwargs): + if backend != 'pytorch': + # set auto backend mode + backend = autoget_backend(model_path) + + pipe = build_pipe(model_path, backend, **kwargs) + gen_config = build_gen_config(**kwargs) + + quit = False + while True: + with pipe.session(gen_config) as sess: + while True: + try: + prompt = input_prompt() + except KeyboardInterrupt: + if not sess._step: + quit = True + print() + break + resps = sess(prompt) + try: + for resp in resps: + print(resp.text, end='', flush=True) + sess.messages.append(role='assistant', content=resp.text) + except KeyboardInterrupt: + pass + finally: + print() + if quit: + break + + +if __name__ == '__main__': + fire.Fire(main) diff --git a/lmdeploy/cli/cli.py b/lmdeploy/cli/cli.py index 59e0482b91..3ae5e57e92 100644 --- a/lmdeploy/cli/cli.py +++ b/lmdeploy/cli/cli.py @@ -4,7 +4,7 @@ import os from ..version import __version__ -from .utils import ArgumentHelper, DefaultsAndTypesHelpFormatter, convert_args, get_chat_template, get_lora_adapters +from .utils import ArgumentHelper, DefaultsAndTypesHelpFormatter, convert_args class CLI(object): @@ -218,39 +218,9 @@ def get_gpu_topo(): @staticmethod def chat(args): """Chat with pytorch or turbomind engine.""" - from lmdeploy.archs import autoget_backend - - chat_template_config = get_chat_template(args.chat_template) - - backend = args.backend - if backend != 'pytorch': - # set auto backend mode - backend = autoget_backend(args.model_path) - - if backend == 'pytorch': - from lmdeploy.messages import PytorchEngineConfig - from lmdeploy.pytorch.chat import run_chat - - adapters = get_lora_adapters(args.adapters) - engine_config = PytorchEngineConfig(dtype=args.dtype, - tp=args.tp, - session_len=args.session_len, - cache_max_entry_count=args.cache_max_entry_count, - adapters=adapters, - enable_prefix_caching=not args.disable_prefix_caching, - device_type=args.device, - eager_mode=args.eager_mode, - quant_policy=args.quant_policy) - 
run_chat(args.model_path, engine_config, chat_template_config=chat_template_config) - else: - from lmdeploy.turbomind.chat import main as run_chat - kwargs = convert_args(args) - kwargs.pop('chat_template') - kwargs.pop('backend') - kwargs.pop('device') - kwargs.pop('eager_mode') - kwargs['chat_template_config'] = chat_template_config - run_chat(**kwargs) + from .chat import main + kwargs = convert_args(args) + main(**kwargs) @staticmethod def add_parsers(): diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 15b866c8e8..0c90c37edf 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -130,7 +130,8 @@ def __call__(self, gen_config: Optional[GenerationConfig] = None, stream_response: bool = True, do_preprocess: bool = True) -> Union[Response, Iterator[Response]]: - self._engine.chat(prompt=prompt, + self.messages.append(dict(role='user', content=prompt)) + self._engine.chat(prompt=self.messages, gen_config=gen_config or self._gen_config, stream_response=stream_response, do_preprocess=do_preprocess, From 22cf30284a4aedeefeeb04f7e4f1c15884f15299 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 8 Apr 2025 21:53:11 +0800 Subject: [PATCH 30/34] remove pt chat.py and tm chat.py --- lmdeploy/cli/chat.py | 22 ++-- lmdeploy/pytorch/chat.py | 154 ---------------------------- lmdeploy/serve/async_engine.py | 6 ++ lmdeploy/turbomind/chat.py | 177 --------------------------------- 4 files changed, 21 insertions(+), 338 deletions(-) delete mode 100644 lmdeploy/pytorch/chat.py delete mode 100644 lmdeploy/turbomind/chat.py diff --git a/lmdeploy/cli/chat.py b/lmdeploy/cli/chat.py index 0b08ce85b9..6d4662f70c 100644 --- a/lmdeploy/cli/chat.py +++ b/lmdeploy/cli/chat.py @@ -11,8 +11,12 @@ def input_prompt(): sentinel = '' # ends when this string is seen return '\n'.join(iter(input, sentinel)) - + def build_pipe(model_path, backend, **kwargs): + # set enable_prefix_cache + disable_prefix_cache = kwargs.pop('disable_prefix_cache', False) + kwargs.update(enable_prefix_caching=not disable_prefix_cache) + # set engine config engine_config = None if backend == 'turbomind': engine_config = TurbomindEngineConfig() @@ -24,11 +28,11 @@ def build_pipe(model_path, backend, **kwargs): for key, value in kwargs.items(): if hasattr(PytorchEngineConfig, key): setattr(engine_config, key, value) - if kwargs.get("adapters", None): + if kwargs.get('adapters', None): from .utils import get_lora_adapters adapters = get_lora_adapters(kwargs['adapters']) engine_config.adapters = adapters - + # set chat template config chat_template = kwargs.get('chat_template', None) chat_template_config = None if chat_template: @@ -61,16 +65,20 @@ def main(model_path, backend, **kwargs): try: prompt = input_prompt() except KeyboardInterrupt: - if not sess._step: - quit = True - print() + quit = True + break + if prompt == 'end': + break + if prompt == 'exit': + quit = True break resps = sess(prompt) try: for resp in resps: print(resp.text, end='', flush=True) - sess.messages.append(role='assistant', content=resp.text) + sess.messages.append(dict(role='assistant', content=resp.text)) except KeyboardInterrupt: + sess.stop() pass finally: print() diff --git a/lmdeploy/pytorch/chat.py b/lmdeploy/pytorch/chat.py deleted file mode 100644 index 9035983781..0000000000 --- a/lmdeploy/pytorch/chat.py +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
- -import asyncio -import os -import random -from typing import Optional - -from lmdeploy.messages import GenerationConfig, PytorchEngineConfig -from lmdeploy.model import ChatTemplateConfig -from lmdeploy.serve.async_engine import get_names_from_model - -os.environ['TM_LOG_LEVEL'] = 'ERROR' - - -def input_prompt(chat_template_name): - """Input a prompt in the consolo interface.""" - if chat_template_name == 'codellama': - print('\nenter !! to end the input >>>\n', end='') - sentinel = '!!' - else: - print('\ndouble enter to end input >>> ', end='') - sentinel = '' # ends when this string is seen - return '\n'.join(iter(input, sentinel)) - - -def run_chat(model_path: str, - engine_config: PytorchEngineConfig, - gen_config: GenerationConfig = None, - session_id: int = 1, - trust_remote_code: bool = True, - chat_template_config: Optional[ChatTemplateConfig] = None): - """An example to perform model inference through the command line - interface. - - Args: - model_path (str): the huggingface model path. - engine_config (PytorchEngineConfig): Config of engine. - gen_config (GenerationConfig): Config of generation. - session_id (int): the identical id of a session. - trust_remote_code (bool): trust remote code. - """ - from lmdeploy import pipeline - - if gen_config is None: - gen_config = GenerationConfig(do_sample=True) - - adapter_name = None - if engine_config.adapters is not None: - adapter_name = next(iter(engine_config.adapters.keys())) - - chat_count = 0 - - def __reset_chat_state(): - """reset chat state.""" - nonlocal chat_count - seed = random.getrandbits(64) - gen_config.random_seed = seed - - async def __generate(prompt: str): - """chat generate.""" - nonlocal chat_count - print() - async for out in pipe.generate( - prompt, - session_id, - gen_config=gen_config, - sequence_start=chat_count == 0, - sequence_end=False, - adapter_name=adapter_name, - ): - print(f'{out.response}', end='', flush=True) - print() - chat_count += 1 - - async def __chat_step(prompt: str): - """chat step.""" - if prompt == 'exit': - exit(0) - elif prompt == 'end': - await pipe.stop_session(session_id) - __reset_chat_state() - else: - await __generate(prompt) - - async def __chat_loop(model_path: str): - """chat loop.""" - __reset_chat_state() - _, chat_template_name = get_names_from_model(model_path) - while True: - prompt = input_prompt(chat_template_name) - await __chat_step(prompt) - - with pipeline( - model_path, - backend_config=engine_config, - chat_template_config=chat_template_config, - ) as pipe: - try: - asyncio.run(__chat_loop(model_path)) - except KeyboardInterrupt: - exit(0) - - -def main(model_path: str, - session_id: int = 1, - top_k: float = 40, - top_p: float = 0.8, - temperature: float = 0.8, - repetition_penalty: float = 1.0, - tp: int = 1, - adapter: str = None, - trust_remote_code: bool = True, - chat_template: str = None): - """An example to perform model inference through the command line - interface. - - Args: - model_path (str): the huggingface model path - session_id (int): the identical id of a session - top_k (int): sampling top k. - top_p (int): sampling top p. - temperature (float): sampling temperature. - repetition_penalty (float): parameter to penalize repetition - tp (int): GPU number used in tensor parallelism - adapter (str): path to lora adapter. - trust_remote_code (bool): Trust remote code. - chat_template (str): A JSON file or string that specifies the - chat template configuration. 
- """ - adapters = None - if adapter is not None: - adapters = dict(default=adapter) - engine_config = PytorchEngineConfig(tp=tp, adapters=adapters) - gen_config = GenerationConfig(max_new_tokens=512, - top_k=top_k, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty, - ignore_eos=False) - chat_template_config = None - if chat_template is not None and os.path.exists(chat_template): - chat_template_config = ChatTemplateConfig.from_json(chat_template) - return run_chat(model_path, - engine_config, - gen_config, - session_id=session_id, - trust_remote_code=trust_remote_code, - chat_template_config=chat_template_config) - - -if __name__ == '__main__': - import fire - - fire.Fire(main) diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 0c90c37edf..092aa105eb 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -119,6 +119,12 @@ def close(self): self._engine = None self.messages = [] + def stop(self): + """stop the session while tokens are being generated.""" + if self._engine: + self._engine._run(coro=self._engine.stop_session(self._id)).result() + self.messages = [] + def __enter__(self): return self diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py deleted file mode 100644 index fe371d05ca..0000000000 --- a/lmdeploy/turbomind/chat.py +++ /dev/null @@ -1,177 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import asyncio -import os -import random - -from lmdeploy import Tokenizer -from lmdeploy.archs import get_model_arch -from lmdeploy.messages import GenerationConfig, TurbomindEngineConfig -from lmdeploy.model import ChatTemplateConfig -from lmdeploy.serve.async_engine import get_names_from_model -from lmdeploy.tokenizer import DetokenizeState -from lmdeploy.utils import _get_and_verify_max_len, get_hf_gen_cfg, get_logger - -log_level = 'ERROR' -logger = get_logger('lmdeploy') - -if os.getenv('TM_LOG_LEVEL') is None: - os.environ['TM_LOG_LEVEL'] = log_level - logger.setLevel(log_level) - - -def input_prompt(model_name): - """Input a prompt in the consolo interface.""" - if model_name == 'codellama': - print('\nenter !! to end the input >>>\n', end='') - sentinel = '!!' 
- else: - print('\ndouble enter to end input >>> ', end='') - sentinel = '' # ends when this string is seen - return '\n'.join(iter(input, sentinel)) - - -async def async_infer(generator, session_id, input_ids, gen_config, stream_output, tokenizer, state): - token_ids = input_ids.copy() - prev_len = 0 - async for output in generator.async_stream_infer(session_id=session_id, - input_ids=input_ids, - gen_config=gen_config, - stream_output=stream_output): - tokens = output.num_token - if tokens > prev_len: - token_ids += output.token_ids[prev_len - tokens:] - response, state = tokenizer.detokenize_incrementally(token_ids, state=state) - prev_len = tokens - print(response, end='', flush=True) - return tokens - - -def main(model_path: str, - session_id: int = 1, - top_k: float = 40, - top_p: float = 0.8, - temperature: float = 0.8, - repetition_penalty: float = 1.0, - cap: str = 'chat', - dtype: str = 'auto', - tp: int = 1, - model_format: str = None, - quant_policy: int = 0, - cache_max_entry_count: float = 0.8, - cache_block_seq_len: int = 64, - rope_scaling_factor: float = 0.0, - enable_prefix_caching: bool = True, - session_len: int = None, - stream_output: bool = True, - request_output_len: int = 1024, - chat_template_config: ChatTemplateConfig = None, - communicator: str = 'nccl', - **kwargs): - """An example to perform model inference through the command line - interface. - - Args: - model_path (str): the path of the deployed model - session_id (int): the identical id of a session - top_k (int): sampling top k. - top_p (int): sampling top p. - temperature (float): sampling temperature. - repetition_penalty (float): parameter to penalize repetition - cap (str): the capability of a model. For example, codellama has the - ability among ['completion', 'infilling', 'chat', 'python'] - dtype (str): data type for model weights and activations. It can be - one of the following values, ['auto', 'float16', 'bfloat16'] - The `auto` option will use FP16 precision for FP32 and FP16 - models, and BF16 precision for BF16 models. - tp (int): GPU number used in tensor parallelism - model_format (str): the layout of the deployed model. It can be one - of the following values [hf, llama, awq] - quant_policy (int): default to 0. When k/v is quantized into 4 or 8 - bit, set it to 4 or 8, respectively - cache_max_entry_count (float): the percentage of gpu memory occupied - by the k/v cache. - cache_block_seq_len (int): the length of the token sequence in a k/v - block, default to 64 - rope_scaling_factor (float): scaling factor used for dynamic ntk, - default to 0. 
TurboMind follows the implementation of transformer - LlamaAttention - enable_prefix_caching (bool): whether enable prefix caching - session_len (int): the length input output tokens - stream_output (bool): indicator for streaming output or not - request_output_len (int): output token nums - chat_template_config (ChatTemplateConfig): chat template config - kwargs (dict): unused args - """ - - # chat template - _, chat_template_name = get_names_from_model(model_path) - if chat_template_config is None: - chat_template_config = ChatTemplateConfig(chat_template_name) - elif chat_template_config.model_name is None: - chat_template_config.model_name = chat_template_name - if chat_template_config.capability is None: - chat_template_config.capability = cap - print('chat_template_config:\n', chat_template_config, sep='', flush=True) - chat_template = chat_template_config.chat_template - - _, model_config = get_model_arch(model_path) - session_len = _get_and_verify_max_len(model_config, session_len) - - # engine - engine_cfg = TurbomindEngineConfig(max_batch_size=1, - model_format=model_format, - session_len=session_len, - cache_max_entry_count=cache_max_entry_count, - cache_block_seq_len=cache_block_seq_len, - enable_prefix_caching=enable_prefix_caching, - quant_policy=quant_policy, - rope_scaling_factor=rope_scaling_factor, - dtype=dtype, - tp=tp, - communicator=communicator) - print('engine_cfg:\n', engine_cfg, sep='', flush=True) - tokenizer = Tokenizer(model_path) - from lmdeploy import turbomind as tm - tm_model = tm.TurboMind.from_pretrained(model_path, tokenizer=tokenizer, engine_config=engine_cfg) - generator = tm_model.create_instance() - - # generation config - gen_config = GenerationConfig(max_new_tokens=request_output_len, - top_k=top_k, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty) - - hf_gen_cfg = get_hf_gen_cfg(model_path) - gen_config.update_from_hf_gen_cfg(hf_gen_cfg, tokenizer.eos_token_id) - - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - - seed = random.getrandbits(64) - messages = [] - while True: - user_input = input_prompt(chat_template_name) - if user_input == 'exit': - exit(0) - elif user_input == 'end': - seed = random.getrandbits(64) - messages = [] - else: - messages.append(dict(role='user', content=user_input)) - prompt = chat_template.messages2prompt(messages) - input_ids = tokenizer.encode(prompt) - gen_config.random_seed = seed - - state = DetokenizeState(len(input_ids)) - - coro = async_infer(generator, session_id, input_ids, gen_config, stream_output, tokenizer, state) - loop.run_until_complete(coro) - - print() - - -if __name__ == '__main__': - import fire - - fire.Fire(main) From 8531df8852d1e4410551319ba377ebbc401bd448 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Wed, 9 Apr 2025 14:43:16 +0800 Subject: [PATCH 31/34] update --- benchmark/profile_throughput.py | 11 +++++------ lmdeploy/profiler.py | 18 +++++++++++------- lmdeploy/serve/utils.py | 10 ++++++---- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py index 4fccd4531c..2bcd81055f 100644 --- a/benchmark/profile_throughput.py +++ b/benchmark/profile_throughput.py @@ -77,8 +77,8 @@ def __init__(self, model_path: str, engine_config: Union[PytorchEngineConfig, Tu self.tm_model = tm_model self.pbar = None - async def _inference(self, req_queue: Queue, session_id: int, temperature: float, top_p: float, top_k: int, - stream_output: bool, skip_tokenize: bool, skip_detokenize: 
bool): + async def _inference(self, req_queue: Queue, temperature: float, top_p: float, top_k: int, stream_output: bool, + skip_tokenize: bool, skip_detokenize: bool): model_inst = self.tm_model.create_instance() sess: Session = None for prompt, _, output_seqlen, cancel_after, sess in iter(req_queue.get_nowait, None): @@ -95,7 +95,7 @@ async def _inference(self, req_queue: Queue, session_id: int, temperature: float prev_len = 0 token_ids = input_ids.copy() - generator = model_inst.async_stream_infer(session_id, + generator = model_inst.async_stream_infer(sess.id, input_ids=input_ids, gen_config=GenerationConfig(max_new_tokens=output_seqlen, temperature=temperature, @@ -122,7 +122,7 @@ async def _inference(self, req_queue: Queue, session_id: int, temperature: float # for pytorch engine to restart a session if isinstance(model_inst, EngineInstance): - await model_inst.async_end(session_id) + await model_inst.async_end(sess.id) self.pbar.update(1) @@ -147,8 +147,7 @@ def process_request(self, requests, profiler: Profiler, concurrency, temperature # start threads tasks = [] for i in range(concurrency): - task = self._inference(req_queue, i, temperature, top_p, top_k, stream_output, skip_tokenize, - skip_detokenize) + task = self._inference(req_queue, temperature, top_p, top_k, stream_output, skip_tokenize, skip_detokenize) tasks.append(task) async def _gather_tasks(tasks): diff --git a/lmdeploy/profiler.py b/lmdeploy/profiler.py index 64cfb07a59..12dcd23609 100644 --- a/lmdeploy/profiler.py +++ b/lmdeploy/profiler.py @@ -1,29 +1,32 @@ # Copyright (c) OpenMMLab. All rights reserved. import csv import time -from typing import List +from itertools import count +from typing import List, Optional import numpy as np class Session: - UNKNOWN = 0 - SUCCESS = 1 - FAIL = 2 + UNKNOWN: int = 0 + SUCCESS: int = 1 + FAIL: int = 2 + ID = count(0) - def __init__(self, input_len, req_output_len): + def __init__(self, input_len: int, req_output_len: int, session_id: Optional[int] = None): self.ts = [] self.ns = [] self.input_len = input_len self.req_output_len = req_output_len self.status = Session.UNKNOWN + self.id = session_id if session_id else next(Session.ID) - def tick(self, n_token): + def tick(self, n_token: int): self.ts.append(time.perf_counter()) self.ns.append(n_token) - def finish(self, status): + def finish(self, status: int): self.status = status @@ -33,6 +36,7 @@ def __init__(self, stream_output: bool, percentages: List[int]): self.sessions: List[Session] = [] self.stream_output = stream_output self.percentages = percentages + self.session_id = count(0) def new_session(self, *args, **kwargs): sess = Session(*args, **kwargs) diff --git a/lmdeploy/serve/utils.py b/lmdeploy/serve/utils.py index 1fb5997222..1c40ee0126 100644 --- a/lmdeploy/serve/utils.py +++ b/lmdeploy/serve/utils.py @@ -70,11 +70,12 @@ async def _async_get_logits_by_turbomind(self, input_ids, steps, max_input_len): gen_config = GenerationConfig(max_new_tokens=1, output_logits='all', do_sample=False) async def _proc(i): - async with self.model_inst(session_id=i) as inst: + session_id = next(self._session_id) + async with self.model_inst(session_id=session_id) as inst: token_ids = input_ids[i][:steps[i] + max_input_len] input_len = len(token_ids) async with self.safe_run(inst, - session_id=i, + session_id=session_id, input_ids=token_ids, gen_config=gen_config, stream_output=False, @@ -97,14 +98,15 @@ async def _async_get_logits_by_pytorch(self, logits = [None] * len(input_ids) async def _proc(i): - async with 
self.model_inst(session_id=i) as inst: + session_id = next(self._session_id) + async with self.model_inst(session_id=session_id) as inst: token_ids = input_ids[i][steps[i]:steps[i] + max_input_len] input_len = len(token_ids) # The reason to set `top_k=1` is that pt engine crashes at top_k sampling stage # when perform inference on a reward model. gen_config = GenerationConfig(max_new_tokens=0, output_logits='all', top_k=1) async with self.safe_run(inst, - session_id=i, + session_id=session_id, input_ids=token_ids, gen_config=gen_config, stream_output=False, From f3ef0d4f7097b6ed3ca57b9f633fdbd9e9eae23e Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Wed, 9 Apr 2025 16:26:18 +0800 Subject: [PATCH 32/34] fix --- lmdeploy/cli/chat.py | 11 ++- src/turbomind/models/llama/BlockTrie.cc | 2 +- src/turbomind/models/llama/LlamaBatch.cc | 4 +- src/turbomind/models/llama/SequenceManager.cc | 71 +++---------------- src/turbomind/models/llama/SequenceManager.h | 2 - tests/pytorch/paging/test_scheduler.py | 3 +- 6 files changed, 25 insertions(+), 68 deletions(-) diff --git a/lmdeploy/cli/chat.py b/lmdeploy/cli/chat.py index 6d4662f70c..ac25bab442 100644 --- a/lmdeploy/cli/chat.py +++ b/lmdeploy/cli/chat.py @@ -38,7 +38,11 @@ def build_pipe(model_path, backend, **kwargs): if chat_template: chat_template_config = ChatTemplateConfig(model_name=chat_template) - pipe = pipeline(model_path, backend_config=engine_config, chat_template_config=chat_template_config, **kwargs) + pipe = pipeline(model_path, + backend_config=engine_config, + chat_template_config=chat_template_config, + log_level='ERROR', + **kwargs) return pipe @@ -68,6 +72,7 @@ def main(model_path, backend, **kwargs): quit = True break if prompt == 'end': + sess.close() break if prompt == 'exit': quit = True @@ -79,10 +84,10 @@ def main(model_path, backend, **kwargs): sess.messages.append(dict(role='assistant', content=resp.text)) except KeyboardInterrupt: sess.stop() - pass finally: - print() + print('\ncancelling the conversation') if quit: + print('exiting...') break diff --git a/src/turbomind/models/llama/BlockTrie.cc b/src/turbomind/models/llama/BlockTrie.cc index 97462ae7f0..be5ca84604 100644 --- a/src/turbomind/models/llama/BlockTrie.cc +++ b/src/turbomind/models/llama/BlockTrie.cc @@ -111,7 +111,7 @@ void BlockTrie::Remove(const std::vector>& nodes, int if (nodes.empty() || valid_size < 1) { return; } - // visit nodes in reverse order + // visit and remove nodes in reverse order for (int idx = nodes.size() - 1; idx >= valid_size; --idx) { auto child = nodes[idx]; auto parent = nodes[idx - 1]; diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index a6b9ae8e02..e671d71ad3 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -195,7 +195,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vectorGet(r->id); + auto ptr = sequence_manager_->Create(r->id); if (!ptr) { signals.push_back([r] { UpdateState(*r, Request::kInvalid, 0); }); continue; @@ -1539,6 +1539,8 @@ auto LlamaBatch::Interrupt(int index, bool force_stop) -> Signal state_->sequences[index] = nullptr; + FT_CHECK(sequence_manager_->Erase(state_->requests[index]->id)); + auto ec = std::exchange(state_->errors[index], Request::kOk); const auto len = state_->requests[index]->sequence_length.getVal(); diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc index a3e1f010ab..34eeb29e68 100644 --- 
a/src/turbomind/models/llama/SequenceManager.cc +++ b/src/turbomind/models/llama/SequenceManager.cc @@ -71,42 +71,6 @@ const Sequence* SequenceManager::Create(uint64_t id) return &it->second; } -const Sequence* SequenceManager::Get(uint64_t id) -{ - auto it = sequences_.find(id); - if (!block_trie_) { - // when prefix_caching is not enabled, check if the id exists. If so, remove the older one - if (it != sequences_.end()) { - if (rank_ == 0) { - TM_LOG_INFO("[SeqMgr][Get] Removing conflicting ID %llu", id); - } - Erase(it); - } - } - else { - if (it != sequences_.end()) { - if (rank_ == 0) { - TM_LOG_INFO("[SeqMgr][Get] Reuse ID %llu, reset the mutable variables of the sequence", id); - } - auto& seq = it->second; - seq.prompt.clear(); - seq.tokens.clear(); - seq.cache_len = 0; - seq.random_state.clear(); - seq.rope_theta = 0.f; - seq.input_embeddings.clear(); - seq.input_embedding_ranges.clear(); - return &it->second; - } - } - Sequence sequence{id}; - it = sequences_.emplace_hint(it, id, std::move(sequence)); - if (rank_ == 0) { - TM_LOG_INFO("[SeqMgr][Get] Create ID %llu", id); - } - return &it->second; -} - bool SequenceManager::Contains(uint64_t id) { return sequences_.find(id) != sequences_.end(); @@ -122,14 +86,18 @@ void SequenceManager::Erase(std::map::iterator& it) else { UpdateAndSetUnlock(seq); } - - it = sequences_.erase(it); - if (block_trie_) { + // if prefix cache enabled, blocks will be shared by sequences, cannot be freed immediately + if (!block_trie_) { + freed_.insert(freed_.end(), seq.blocks.begin(), seq.blocks.end()); + } + else { + // prune the invalid nodes in the tree auto is_valid = [this](int block_id, uint64_t block_unique_id) -> bool { return this->block_manager_->unique_id(block_id) == block_unique_id; }; block_trie_->Prune(is_valid); } + (void)sequences_.erase(it); } bool SequenceManager::Erase(uint64_t id) @@ -169,7 +137,7 @@ void SequenceManager::CachePrompt(const Sequences& sequences, int active_size) vector2string(block_ids).c_str(), vector2string(block_unique_ids).c_str()); } - // remove invalid nodes from trie tree if there is any + // remove invalid nodes from the path in the trie tree if there is any if (valid < block_ids.size()) { block_trie_->Remove(nodes, valid); } @@ -197,7 +165,7 @@ void SequenceManager::CacheGeneration(const Sequence& seq) vector2string(block_ids).c_str(), vector2string(block_unique_ids).c_str()); } - // remove invalid nodes from trie tree if there is any + // remove invalid nodes from the path in the trie tree if there is any if (valid < block_ids.size()) { block_trie_->Remove(nodes, valid); } @@ -478,7 +446,7 @@ void SequenceManager::PrefixMatch(Sequences& sequences) std::tie(block_ids, unique_ids, matched_nodes) = block_trie_->Match(seq); int valid = block_manager_->Verify(block_ids, unique_ids); - // remove invalid nodes from trie tree if there is any + // remove invalid nodes from the path in the trie tree if there is any if (valid < block_ids.size()) { block_trie_->Remove(matched_nodes, valid); } @@ -495,24 +463,7 @@ void SequenceManager::PrefixMatch(Sequences& sequences) vector2string(unique_ids).c_str()); } - if (!seq.blocks.empty()) { - // seq.cache_len == 0 but seq.blocks is not empty. 
It means the new seq reuses an older seq's ID - // So we should UNLOCK the unmatched blocks and reset seq.blocks as matched_blockes - BlockIds unmatched_ids; - std::set_difference(seq.blocks.begin(), - seq.blocks.end(), - matched_ids.begin(), - matched_ids.end(), - std::inserter(unmatched_ids, unmatched_ids.begin())); - block_manager_->Unlock(unmatched_ids); - seq.blocks.clear(); - seq.block_unique_ids.clear(); - if (rank_ == 0) { - TM_LOG_INFO("[SegMgr][match] ID %llu, unlock unmatched blocks %d", seq.id, unmatched_ids.size()); - TM_LOG_DEBUG( - "[SegMgr][match] ID %llu, unmatched block_ids %s", seq.id, vector2string(unmatched_ids).c_str()); - } - } + FT_CHECK(seq.blocks.empty()); seq.cache_len = valid * block_seq_len_; seq.blocks.insert(seq.blocks.end(), block_ids.begin(), block_ids.begin() + valid); seq.block_unique_ids.insert(seq.block_unique_ids.end(), unique_ids.begin(), unique_ids.begin() + valid); diff --git a/src/turbomind/models/llama/SequenceManager.h b/src/turbomind/models/llama/SequenceManager.h index f6b31031ff..5c68d05313 100644 --- a/src/turbomind/models/llama/SequenceManager.h +++ b/src/turbomind/models/llama/SequenceManager.h @@ -88,8 +88,6 @@ class SequenceManager { [[nodiscard]] const Sequence* Create(uint64_t id); - [[nodiscard]] const Sequence* Get(uint64_t id); - [[nodiscard]] bool Contains(uint64_t id); [[nodiscard]] bool Erase(uint64_t id); diff --git a/tests/pytorch/paging/test_scheduler.py b/tests/pytorch/paging/test_scheduler.py index f14ab8249e..f8b1d65c05 100644 --- a/tests/pytorch/paging/test_scheduler.py +++ b/tests/pytorch/paging/test_scheduler.py @@ -25,7 +25,8 @@ def cache_config(self, block_size, num_cpu_blocks, num_gpu_blocks): yield CacheConfig(max_batches=256, block_size=block_size, num_cpu_blocks=num_cpu_blocks, - num_gpu_blocks=num_gpu_blocks) + num_gpu_blocks=num_gpu_blocks, + enable_prefix_caching=False) @pytest.fixture def scheduler_config(self): From 87dfbb94d798dd51ac260fc40b9194d6e360b413 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Wed, 9 Apr 2025 18:50:41 +0800 Subject: [PATCH 33/34] update --- lmdeploy/api.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lmdeploy/api.py b/lmdeploy/api.py index 3377e7800b..8c064c1975 100644 --- a/lmdeploy/api.py +++ b/lmdeploy/api.py @@ -69,10 +69,6 @@ def pipeline(model_path: str, model_path = get_model(model_path, download_dir, revision) task, pipeline_class = get_task(model_path) - if task == 'vlm': - if backend_config and backend_config.enable_prefix_caching: - backend_config.enable_prefix_caching = False - logger.warning('VLM does not support prefix caching.') if type(backend_config) is not PytorchEngineConfig: # set auto backend mode @@ -80,6 +76,11 @@ def pipeline(model_path: str, backend = 'pytorch' if type(backend_config) is PytorchEngineConfig else 'turbomind' logger.info(f'Using {backend} engine') + if task == 'vlm': + if backend_config and backend_config.enable_prefix_caching: + backend_config.enable_prefix_caching = False + logger.warning('VLM does not support prefix caching.') + return pipeline_class(model_path, backend=backend, backend_config=backend_config, From 61f2f0ac6672aa307a310a8e3640a072624b6006 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Mon, 12 May 2025 20:53:53 +0800 Subject: [PATCH 34/34] update --- lmdeploy/serve/openai/api_server.py | 14 +++++++++----- lmdeploy/serve/utils.py | 4 ++-- src/turbomind/models/llama/LlamaBatch.cc | 8 ++++---- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/lmdeploy/serve/openai/api_server.py 
b/lmdeploy/serve/openai/api_server.py index 9bd6049e5d..8a991e5cec 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -120,17 +120,17 @@ def create_error_response(status: HTTPStatus, message: str, error_type='invalid_ async def check_request(request) -> Optional[JSONResponse]: """Check if a request is valid.""" if hasattr(request, 'model') and request.model not in get_model_list(): - return create_error_response(HTTPStatus.NOT_FOUND, f'The model "{request.model}" does not exist.') + return create_error_response(HTTPStatus.NOT_FOUND, f'The model `{request.model}` does not exist.') if hasattr(request, 'n') and request.n <= 0: - return create_error_response(HTTPStatus.BAD_REQUEST, f'The n "{request.n}" must be a positive int.') + return create_error_response(HTTPStatus.BAD_REQUEST, f'The n `{request.n}` must be a positive int.') if hasattr(request, 'top_p') and not (request.top_p > 0 and request.top_p <= 1): - return create_error_response(HTTPStatus.BAD_REQUEST, f'The top_p "{request.top_p}" must be in (0, 1].') + return create_error_response(HTTPStatus.BAD_REQUEST, f'The top_p `{request.top_p}` must be in (0, 1].') if hasattr(request, 'top_k') and request.top_k < 0: return create_error_response(HTTPStatus.BAD_REQUEST, - f'The top_k "{request.top_k}" cannot be a negative integer.') + f'The top_k `{request.top_k}` cannot be a negative integer.') if hasattr(request, 'temperature') and not (request.temperature <= 2 and request.temperature >= 0): return create_error_response(HTTPStatus.BAD_REQUEST, - f'The temperature "{request.temperature}" must be in [0, 2]') + f'The temperature `{request.temperature}` must be in [0, 2]') return @@ -1113,6 +1113,10 @@ def serve(model_path: str, if proxy_url is not None: VariableInterface.proxy_url = proxy_url VariableInterface.api_server_url = f'{http_or_https}://{server_name}:{server_port}' # noqa + for i in range(3): + print(f'HINT: Please open \033[93m\033[1m{http_or_https}://' + f'{server_name}:{server_port}\033[0m in a browser for detailed api' + ' usage!!!') uvicorn.run(app=app, host=server_name, port=server_port, diff --git a/lmdeploy/serve/utils.py b/lmdeploy/serve/utils.py index 7bdee7f4da..3cf9ba409c 100644 --- a/lmdeploy/serve/utils.py +++ b/lmdeploy/serve/utils.py @@ -120,7 +120,7 @@ async def _proc(i): session_ids = list(range(len(input_ids))) tasks = [_proc(i) for i in range(len(input_ids))] await asyncio.gather(*tasks) - if sequence_end and self.backend == 'pytorch': + if sequence_end: for session_id in session_ids: await self.end_session(session_id) return logits @@ -243,4 +243,4 @@ def _get_ppl(self, input_ids, steps, max_input_len, sequence_start: bool = True, result.append(loss.item() / target_count.item()) target_counts.append(target_count) logger.info(f'ppl result: {result}') - return result + return result, target_counts diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index aa18bf8167..535dc76bed 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -934,14 +934,14 @@ void LlamaBatch::OutputLogits(const Tensor& logits, int first, int last, Generat { const auto& src_buf = logits.buffer(); const auto elem_size = byte_size(logits.dtype(), 1); - // when `is_all` is true, logits only contains last token of the sequences + // when `is_all` is false, logits only contains last token of the sequences const bool is_all = out_type == GenerationConfig::kAll; int base = 0; for (int i = first; i < last; ++i) { - 
const int input_len = h_input_length_buf_[i]; // input lenght for this iter + const int input_len = h_input_length_buf_[i]; // input length for this iter if (state_->requests[i]->gen_cfg.output_logits == out_type) { @@ -977,10 +977,10 @@ void LlamaBatch::OutputLogits(const Tensor& logits, int first, int last, Generat if (is_all) { // Skip invalid tokens caused by cache miss - src_base += std::max(0, (history_len + offset) - cache_len); + src_base += std::max(0, diff); } // Skip previous chunks - int dst_base = std::max(0, cache_len - (history_len + offset)); + int dst_base = std::max(0, -diff); check_cuda_error(cudaMemcpy2DAsync(dst_buf.raw_data(dst_base * model_->vocab_size_), elem_size * model_->vocab_size_,
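
Note on the last hunk: the "diff"-based rewrite is easier to follow outside the CUDA copy. The sketch below restates the same indexing in plain Python, assuming "diff" stands for (history_len + offset) - cache_len, as implied by the two expressions it replaces; the function name and the sample values are illustrative only and are not part of the engine.

    # Minimal sketch of the logits-copy offsets under prefix caching.
    # Assumes diff = (history_len + offset) - cache_len, matching the
    # expressions the patch replaces; names here are illustrative only.
    def logits_copy_offsets(history_len, offset, cache_len):
        diff = (history_len + offset) - cache_len
        src_base = max(0, diff)    # skip invalid tokens caused by a cache miss
        dst_base = max(0, -diff)   # skip destination slots of previous prefill chunks
        return src_base, dst_base

    if __name__ == '__main__':
        # prefix-cache hit beyond this chunk's start: only the destination shifts
        print(logits_copy_offsets(history_len=0, offset=0, cache_len=64))    # (0, 64)
        # cache covers less than history + offset: only the source shifts
        print(logits_copy_offsets(history_len=96, offset=32, cache_len=64))  # (64, 0)

Under this convention a cache hit only advances the destination offset and a cache miss only advances the source offset, which is exactly what the two std::max terms in the hunk encode.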