{"version":"1.0","workflow_uuid":"556eded4-26f7-4c21-a701-b6c6a117852b","workflow_title":"ExLlamaV2 — Fast Quantized LLM Inference","install_contract":{"version":"1.0","installReady":false,"title":"ExLlamaV2 — Fast Quantized LLM Inference","summary":"ExLlamaV2 runs quantized LLMs on consumer GPUs with optimized CUDA kernels. EXL2/GPTQ/HQQ, PagedAttention, speculative decoding.","assetType":"Scripts","pageUrl":"https://tokrepo.com/en/workflows/exllamav2-fast-quantized-llm-inference-556eded4","sourceUrl":"https://github.com/turboderp/exllamav2","intendedFor":[],"firstActions":[],"agentFirstSteps":[],"targetPaths":[],"verification":[],"startingPoints":[],"example":"","successOutcome":"","boundaries":[],"askUserIf":["the current workspace stack cannot be matched to a safe upstream template","the target path is not the project root, or an existing file should be merged instead of overwritten"]}}