
Commit 39cfe23

Put more wiggle room. (#3189)

* Put more wiggle room.
* Fixing the Makefile by using the lockfile.
* Pre-commit

1 parent 3758029

2 files changed: +5 −3 lines

server/Makefile (3 additions, 2 deletions)

@@ -10,6 +10,7 @@ include Makefile-flashinfer
 unit-tests:
 	pip install -U pip uv
 	uv pip install -e ".[dev]"
+	uv sync --inexact --extra dev --active
 	pytest -s -vv -m "not private" tests

 gen-server:
@@ -30,14 +31,14 @@ gen-server-raw:
 	touch text_generation_server/pb/__init__.py

 install-server: gen-server
-	uv pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"
+	uv sync --inexact --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --active


 install: install-cuda
 	echo "Installed server"

 install-cuda: install-server install-flash-attention-v2-cuda install-flash-attention
-	uv pip install -e ".[attention,bnb,marlin,moe]"
+	uv sync --inexact --extra attention --extra bnb --extra marlin --extra moe --active
 	uv pip install nvidia-nccl-cu12==2.22.3
 	kernels download .
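The "use the lockfile" fix swaps `uv pip install -e ".[extras]"` for `uv sync`: the former resolves dependency versions at install time, so repeated builds can pick up different versions, while `uv sync` installs exactly what uv.lock pins. A minimal sketch of the distinction, with hypothetical target names (only the uv flags mirror this commit):

# Hypothetical Makefile targets; names are illustrative, not from this repo.
install-dev-unlocked:
	uv pip install -e ".[dev]"                # resolves fresh; versions can drift between runs

install-dev-locked:
	uv sync --inexact --extra dev --active    # installs the exact versions pinned in uv.lock
	# --inexact: do not prune packages that other targets installed
	# --extra dev: include the optional "dev" dependency group
	# --active: target the active virtualenv rather than the project's .venv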

server/text_generation_server/models/globals.py (2 additions, 1 deletion)

@@ -28,7 +28,8 @@
     raise RuntimeError("Prefix caching is only supported with flashinfer")

 MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
-TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.93"))
+# Test a 70B model on 4xA100 under load for latest failure
+TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.90"))
 assert TGI_WIGGLE_ROOM > 0
 assert TGI_WIGGLE_ROOM < 1
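TGI_WIGGLE_ROOM is a safety factor on GPU memory budgeting: dropping the default from 0.93 to 0.90 reserves 10% headroom instead of 7%, which the new comment ties to a 70B model failing on 4xA100 under load. A hedged sketch of how such a factor is typically applied when sizing a KV cache; the helper below is illustrative, not TGI's actual code:

import os

import torch

# Fraction of free GPU memory we allow ourselves to claim; the remainder
# absorbs fragmentation and allocation spikes under load.
TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.90"))
assert 0 < TGI_WIGGLE_ROOM < 1


def usable_kv_cache_bytes(device: torch.device) -> int:
    """Return a KV-cache budget: currently free bytes scaled by the factor."""
    free_bytes, _total_bytes = torch.cuda.mem_get_info(device)
    return int(free_bytes * TGI_WIGGLE_ROOM)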
