[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"similar-nomadkaraoke--python-audio-separator":3,"tool-nomadkaraoke--python-audio-separator":64},[4,16,27,35,48,56],{"id":5,"name":6,"github_repo":7,"description_zh":8,"stars":9,"difficulty_score":10,"last_commit_at":11,"category_tags":12,"status":15},8272,"opencode","anomalyco\u002Fopencode","OpenCode 是一款开源的 AI 编程助手（Coding Agent），旨在像一位智能搭档一样融入您的开发流程。它不仅仅是一个代码补全插件，而是一个能够理解项目上下文、自主规划任务并执行复杂编码操作的智能体。无论是生成全新功能、重构现有代码，还是排查难以定位的 Bug，OpenCode 都能通过自然语言交互高效完成，显著减少开发者在重复性劳动和上下文切换上的时间消耗。\n\n这款工具专为软件开发者、工程师及技术研究人员设计，特别适合希望利用大模型能力来提升编码效率、加速原型开发或处理遗留代码维护的专业人群。其核心亮点在于完全开源的架构，这意味着用户可以审查代码逻辑、自定义行为策略，甚至私有化部署以保障数据安全，彻底打破了传统闭源 AI 助手的“黑盒”限制。\n\n在技术体验上，OpenCode 提供了灵活的终端界面（Terminal UI）和正在测试中的桌面应用程序，支持 macOS、Windows 及 Linux 全平台。它兼容多种包管理工具，安装便捷，并能无缝集成到现有的开发环境中。无论您是追求极致控制权的资深极客，还是渴望提升产出的独立开发者，OpenCode 都提供了一个透明、可信",144296,1,"2026-04-16T14:50:03",[13,14],"Agent","插件","ready",{"id":17,"name":18,"github_repo":19,"description_zh":20,"stars":21,"difficulty_score":22,"last_commit_at":23,"category_tags":24,"status":15},6121,"gemini-cli","google-gemini\u002Fgemini-cli","gemini-cli 是一款由谷歌推出的开源 AI 命令行工具，它将强大的 Gemini 大模型能力直接集成到用户的终端环境中。对于习惯在命令行工作的开发者而言，它提供了一条从输入提示词到获取模型响应的最短路径，无需切换窗口即可享受智能辅助。\n\n这款工具主要解决了开发过程中频繁上下文切换的痛点，让用户能在熟悉的终端界面内直接完成代码理解、生成、调试以及自动化运维任务。无论是查询大型代码库、根据草图生成应用，还是执行复杂的 Git 操作，gemini-cli 都能通过自然语言指令高效处理。\n\n它特别适合广大软件工程师、DevOps 人员及技术研究人员使用。其核心亮点包括支持高达 100 万 token 的超长上下文窗口，具备出色的逻辑推理能力；内置 Google 搜索、文件操作及 Shell 命令执行等实用工具；更独特的是，它支持 MCP（模型上下文协议），允许用户灵活扩展自定义集成，连接如图像生成等外部能力。此外，个人谷歌账号即可享受免费的额度支持，且项目基于 Apache 2.0 协议完全开源，是提升终端工作效率的理想助手。",100752,2,"2026-04-10T01:20:03",[14,13,25,26],"图像","开发框架",{"id":28,"name":29,"github_repo":30,"description_zh":31,"stars":32,"difficulty_score":22,"last_commit_at":33,"category_tags":34,"status":15},4721,"markitdown","microsoft\u002Fmarkitdown","MarkItDown 是一款由微软 AutoGen 团队打造的轻量级 Python 工具，专为将各类文件高效转换为 Markdown 格式而设计。它支持 PDF、Word、Excel、PPT、图片（含 OCR）、音频（含语音转录）、HTML 乃至 YouTube 链接等多种格式的解析，能够精准提取文档中的标题、列表、表格和链接等关键结构信息。\n\n在人工智能应用日益普及的今天，大语言模型（LLM）虽擅长处理文本，却难以直接读取复杂的二进制办公文档。MarkItDown 恰好解决了这一痛点，它将非结构化或半结构化的文件转化为模型“原生理解”且 Token 效率极高的 Markdown 格式，成为连接本地文件与 AI 分析 pipeline 的理想桥梁。此外，它还提供了 MCP（模型上下文协议）服务器，可无缝集成到 Claude Desktop 等 LLM 应用中。\n\n这款工具特别适合开发者、数据科学家及 AI 研究人员使用，尤其是那些需要构建文档检索增强生成（RAG）系统、进行批量文本分析或希望让 AI 助手直接“阅读”本地文件的用户。虽然生成的内容也具备一定可读性，但其核心优势在于为机器",93400,"2026-04-06T19:52:38",[14,26],{"id":36,"name":37,"github_repo":38,"description_zh":39,"stars":40,"difficulty_score":22,"last_commit_at":41,"category_tags":42,"status":15},2268,"ML-For-Beginners","microsoft\u002FML-For-Beginners","ML-For-Beginners 是由微软推出的一套系统化机器学习入门课程，旨在帮助零基础用户轻松掌握经典机器学习知识。这套课程将学习路径规划为 12 周，包含 26 节精炼课程和 52 道配套测验，内容涵盖从基础概念到实际应用的完整流程，有效解决了初学者面对庞大知识体系时无从下手、缺乏结构化指导的痛点。\n\n无论是希望转型的开发者、需要补充算法背景的研究人员，还是对人工智能充满好奇的普通爱好者，都能从中受益。课程不仅提供了清晰的理论讲解，还强调动手实践，让用户在循序渐进中建立扎实的技能基础。其独特的亮点在于强大的多语言支持，通过自动化机制提供了包括简体中文在内的 50 多种语言版本，极大地降低了全球不同背景用户的学习门槛。此外，项目采用开源协作模式，社区活跃且内容持续更新，确保学习者能获取前沿且准确的技术资讯。如果你正寻找一条清晰、友好且专业的机器学习入门之路，ML-For-Beginners 将是理想的起点。",85092,"2026-04-10T11:13:16",[25,43,44,14,13,45,46,26,47],"数据工具","视频","其他","语言模型","音频",{"id":49,"name":50,"github_repo":51,"description_zh":52,"stars":53,"difficulty_score":10,"last_commit_at":54,"category_tags":55,"status":15},7525,"codex","openai\u002Fcodex","Codex 是 OpenAI 推出的一款轻量级编程智能体，专为在终端环境中高效运行而设计。它允许开发者直接在命令行界面与 AI 交互，完成代码生成、调试、重构及项目维护等任务，无需频繁切换至浏览器或集成开发环境，从而显著提升了编码流程的连贯性与专注度。\n\n这款工具主要解决了传统 AI 辅助编程中上下文割裂的问题。通过将智能体本地化运行，Codex 能够更紧密地结合当前工作目录的文件结构，提供更具针对性的代码建议，同时支持以自然语言指令驱动复杂的开发操作，让“对话即编码”成为现实。\n\nCodex 
# nomadkaraoke/python-audio-separator

> Easy to use stem (e.g. instrumental/vocals) separation from CLI or as a python package, using a variety of amazing pre-trained models (primarily from UVR)

python-audio-separator is a powerful open-source audio processing tool for splitting mixed audio into separate stems (vocals, instrumental, drums, bass, and more) from the command line or from Python code. It addresses a common pain point in music production, video editing, and karaoke work: extracting specific sound elements from a mix. It lets you quickly obtain clean vocal or instrumental tracks and even apply professional processing such as denoising and reverb removal.

The tool is well suited to developers integrating it into their own projects and to researchers comparing separation models, while its simple command-line interface lets ordinary users get started without a programming background. Its core strength is that it bundles many of the top pre-trained models from the UVR (Ultimate Vocal Remover) community, including the MDX-Net, VR Arch, Demucs, and newer MDXC architectures, for separation quality that leads the field. It is also broadly compatible: beyond NVIDIA GPU acceleration and Google Colab, it includes CoreML acceleration tuned for Apple Silicon (M1/M2 and later), so it runs efficiently even on machines without a discrete GPU. Whether you are a music enthusiast making backing tracks or a team batch-processing audio, it is a flexible and reliable choice.

---

<div align="center">

# 🎶 Audio Separator 🎶

[![PyPI version](https://badge.fury.io/py/audio-separator.svg)](https://badge.fury.io/py/audio-separator)
[![Conda Version](https://img.shields.io/conda/vn/conda-forge/audio-separator.svg)](https://anaconda.org/conda-forge/audio-separator)
[![Docker pulls](https://img.shields.io/docker/pulls/beveradb/audio-separator.svg)](https://hub.docker.com/r/beveradb/audio-separator/tags)
[![codecov](https://codecov.io/gh/karaokenerds/python-audio-separator/graph/badge.svg?token=N7YK4ET5JP)](https://codecov.io/gh/karaokenerds/python-audio-separator)
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1gSlmSmna7f7fH6OjsiMEDLl-aJ9kGPkY?usp=sharing)
[![Open In Huggingface](https://huggingface.co/datasets/huggingface/badges/resolve/main/open-in-hf-spaces-sm.svg)](https://huggingface.co/spaces/nomadkaraoke/audio-separator)

</div>
**Summary:** Easy to use audio stem separation from the command line or as a dependency in your own Python project, using the amazing MDX-Net, VR Arch, Demucs and MDXC models available in UVR by @Anjok07 & @aufr33.

Audio Separator is a Python package that allows you to separate an audio file into various stems, using models trained by @Anjok07 for use with [Ultimate Vocal Remover](https://github.com/Anjok07/ultimatevocalremovergui).

The simplest (and probably most used) use case for this package is to separate an audio file into two stems, Instrumental and Vocals, which can be very useful for producing karaoke videos! However, the models available in UVR can separate audio into many more stems, such as Drums, Bass, Piano, and Guitar, and perform other audio processing tasks, such as denoising or removing echo/reverb.

<details>
<summary align="center"><b>Table of Contents</b></summary>

- [🎶 Audio Separator 🎶](#-audio-separator-)
  - [Features](#features)
  - [Installation 🛠️](#installation-%EF%B8%8F)
    - [🐳 Docker](#-docker)
    - [🎮 Nvidia GPU with CUDA or 🧪 Google Colab](#-nvidia-gpu-with-cuda-or--google-colab)
    - [Apple Silicon, macOS Sonoma+ with M1 or newer CPU (CoreML acceleration)](#-apple-silicon-macos-sonoma-with-m1-or-newer-cpu-coreml-acceleration)
    - [🐢 No hardware acceleration, CPU only](#-no-hardware-acceleration-cpu-only)
    - [🎥 FFmpeg dependency](#-ffmpeg-dependency)
  - [GPU / CUDA specific installation steps with Pip](#gpu--cuda-specific-installation-steps-with-pip)
    - [Multiple CUDA library versions may be needed](#multiple-cuda-library-versions-may-be-needed)
  - [Usage 🚀](#usage-)
    - [Command Line Interface (CLI)](#command-line-interface-cli)
    - [Listing and Filtering Available Models](#listing-and-filtering-available-models)
      - [Filtering Models](#filtering-models)
      - [Limiting Results](#limiting-results)
      - [JSON Output](#json-output)
    - [Full command-line interface options](#full-command-line-interface-options)
    - [As a Dependency in a Python Project](#as-a-dependency-in-a-python-project)
      - [Batch processing and processing with multiple models](#batch-processing-and-processing-with-multiple-models)
      - [Renaming Stems](#renaming-stems)
  - [Parameters for the Separator class](#parameters-for-the-separator-class)
  - [Remote API Usage 🌐](#remote-api-usage-)
  - [Requirements 📋](#requirements-)
  - [Developing Locally](#developing-locally)
    - [Prerequisites](#prerequisites)
    - [Clone the Repository](#clone-the-repository)
    - [Create and activate the Conda Environment](#create-and-activate-the-conda-environment)
    - [Install Dependencies](#install-dependencies)
    - [Running the Command-Line Interface Locally](#running-the-command-line-interface-locally)
    - [Deactivate the Virtual Environment](#deactivate-the-virtual-environment)
    - [Building the Package](#building-the-package)
  - [Contributing 🤝](#contributing-)
  - [License 📄](#license-)
  - [Credits 🙏](#credits-)
  - [Contact 💌](#contact-)
  - [Thanks to all contributors for their efforts](#thanks-to-all-contributors-for-their-efforts)
</details>

---

## Features

- Separate audio into multiple stems, e.g. instrumental and vocals.
- Supports all common audio formats (WAV, MP3, FLAC, M4A, etc.)
- Ability to run inference using a pre-trained model in PTH or ONNX format.
- CLI support for easy use in scripts and batch processing.
- Python API for integration into other projects.

## Installation 🛠️

### 🐳 Docker

If you're able to use Docker, you don't actually need to _install_ anything - there are [images published on Docker Hub](https://hub.docker.com/r/beveradb/audio-separator/tags) for GPU (CUDA) and CPU inferencing, for both `amd64` and `arm64` platforms.

You probably want to volume-mount a folder containing whatever file you want to separate, which can then also be used as the output folder.

For instance, if your current directory has the file `input.wav`, you could execute `audio-separator` as shown below (see the [usage](#usage-) section for more details):

```sh
docker run -it -v `pwd`:/workdir beveradb/audio-separator input.wav
```

If you're using a machine with a GPU, you'll want to use the GPU-specific image and pass the GPU device through to the container, like this:

```sh
docker run -it --gpus all -v `pwd`:/workdir beveradb/audio-separator:gpu input.wav
```

If the GPU isn't being detected, make sure your Docker runtime environment is passing through the GPU correctly - there are [various guides](https://www.celantur.com/blog/run-cuda-in-docker-on-linux/) online to help with that.

### 🎮 Nvidia GPU with CUDA or 🧪 Google Colab

**Supported CUDA versions:** 11.8 and 12.2

💬 If successfully configured, you should see this log message when running `audio-separator --env_info`:
 `ONNXruntime has CUDAExecutionProvider available, enabling acceleration`

Conda:
```sh
conda install pytorch=*=*cuda* onnxruntime=*=*cuda* audio-separator -c pytorch -c conda-forge
```

Pip:
```sh
pip install "audio-separator[gpu]"
```

Docker:
```sh
beveradb/audio-separator:gpu
```

### Apple Silicon, macOS Sonoma+ with M1 or newer CPU (CoreML acceleration)

💬 If successfully configured, you should see this log message when running `audio-separator --env_info`:
 `ONNXruntime has CoreMLExecutionProvider available, enabling acceleration`

Pip:
```sh
pip install "audio-separator[cpu]"
```

### 🐢 No hardware acceleration, CPU only

Conda:
```sh
conda install audio-separator -c pytorch -c conda-forge
```

Pip:
```sh
pip install "audio-separator[cpu]"
```

Docker:
```sh
beveradb/audio-separator
```
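💬 If you'd rather check from Python which acceleration providers your environment actually exposes, rather than reading the `--env_info` log, here is a minimal sketch using the underlying libraries directly (this is not part of the `audio-separator` API, just the standard `onnxruntime` and `torch` checks):

```python
# Check hardware acceleration availability without running audio-separator.
import onnxruntime as ort
import torch

print("ONNX Runtime providers:", ort.get_available_providers())
print("PyTorch CUDA available:", torch.cuda.is_available())
print("PyTorch MPS (Apple Silicon) available:", torch.backends.mps.is_available())
```

If `CUDAExecutionProvider` or `CoreMLExecutionProvider` appears in the provider list, `audio-separator` should pick it up automatically, per the log messages above.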
### 🎥 FFmpeg dependency

💬 To test if `audio-separator` has been successfully configured to use FFmpeg, run `audio-separator --env_info`. The log will show `FFmpeg installed`.

If you installed `audio-separator` using `conda` or `docker`, FFmpeg should already be available in your environment.

Otherwise, you may need to install FFmpeg separately. It should be easy to install on most platforms, e.g.:

🐧 Debian/Ubuntu:
```sh
apt-get update; apt-get install -y ffmpeg
```

macOS:
```sh
brew update; brew install ffmpeg
```

## GPU / CUDA specific installation steps with Pip

In theory, all you should need to do to get `audio-separator` working with a GPU is install it with the `[gpu]` extra, as above.

However, getting both PyTorch and ONNX Runtime working with CUDA support can sometimes be tricky, so it may not work that easily.

You may need to reinstall both packages directly, allowing pip to calculate the right versions for your platform, for example:

- `pip uninstall torch onnxruntime`
- `pip cache purge`
- `pip install --force-reinstall torch torchvision torchaudio`
- `pip install --force-reinstall onnxruntime-gpu`

I generally recommend installing the latest version of PyTorch for your environment using the command recommended by the wizard here:
<https://pytorch.org/get-started/locally/>

### Multiple CUDA library versions may be needed

Depending on your CUDA version and environment, you may need to install specific version(s) of the CUDA libraries for ONNX Runtime to use your GPU.

🧪 Google Colab, for example, now uses CUDA 12 by default, but ONNX Runtime still needs CUDA 11 libraries to work.

If you see the error `Failed to load library` or `cannot open shared object file` when you run `audio-separator`, this is likely the issue.

You can install the CUDA 11 libraries _alongside_ CUDA 12 like so:
```sh
apt update; apt install nvidia-cuda-toolkit
```

If you encounter the following messages when running on Google Colab or in another environment:
```
[E:onnxruntime:Default, provider_bridge_ort.cc:1862 TryGetProviderInfo_CUDA] /onnxruntime_src/onnxruntime/core/session/provider_bridge_ort.cc:1539 onnxruntime::Provider& onnxruntime::ProviderLibrary::Get() [ONNXRuntimeError] : 1 : FAIL : Failed to load library libonnxruntime_providers_cuda.so with error: libcudnn_adv.so.9: cannot open shared object file: No such file or directory

[W:onnxruntime:Default, onnxruntime_pybind_state.cc:993 CreateExecutionProviderInstance] Failed to create CUDAExecutionProvider. Require cuDNN 9.* and CUDA 12.*. Please install all dependencies as mentioned in the GPU requirements page (https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements), make sure they're in the PATH, and that your GPU is supported.
```
You can resolve this by running the following command:
```sh
python -m pip install ort-nightly-gpu --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ort-cuda-12-nightly/pypi/simple/
```

> Note: if anyone knows how to make this cleaner, so we can support different platform-specific dependencies for hardware acceleration without a separate installation process for each, please let me know or raise a PR!

## Usage 🚀

### Command Line Interface (CLI)

You can use Audio Separator via the command line, for example:

```sh
audio-separator /path/to/your/input/audio.wav --model_filename model_bs_roformer_ep_317_sdr_12.9755.ckpt
```

This command will download the specified model file, process the `audio.wav` input audio and generate two new files in the current directory, one containing vocals and one containing instrumental.

**Note:** You do not need to download any files yourself - audio-separator does that automatically for you!

To see a list of supported models, run `audio-separator --list_models`.

Any file listed in the list models output can be specified (with file extension) with the `model_filename` parameter (e.g. `--model_filename UVR_MDXNET_KARA_2.onnx`) and it will be automatically downloaded to the `--model_file_dir` (default: `/tmp/audio-separator-models/`) folder on first usage.

### Listing and Filtering Available Models

You can view all available models using the `--list_models` (or `-l`) flag:

```sh
audio-separator --list_models
```

The output shows a table with the following columns:

- Model Filename: the filename to use with `--model_filename`
- Arch: the model architecture (MDX, MDXC, Demucs, etc.)
- Output Stems (SDR): the stems this model can separate, with Signal-to-Distortion Ratio scores where available
- Friendly Name: a human-readable name describing the model

#### Filtering Models

You can filter and sort the model list by stem type using `--list_filter`. For example, to find models that can separate drums:

```sh
audio-separator -l --list_filter=drums
```

Example output:
```
-----------------------------------------------------------------------------------------------------------------------------------
Model Filename        Arch    Output Stems (SDR)                                            Friendly Name
-----------------------------------------------------------------------------------------------------------------------------------
htdemucs_ft.yaml      Demucs  vocals (10.8), drums (10.1), bass (11.9), other               Demucs v4: htdemucs_ft
hdemucs_mmi.yaml      Demucs  vocals (10.3), drums (9.7), bass (12.0), other                Demucs v4: hdemucs_mmi
htdemucs.yaml         Demucs  vocals (10.0), drums (9.4), bass (11.3), other                Demucs v4: htdemucs
htdemucs_6s.yaml      Demucs  vocals (9.7), drums (8.5), bass (10.0), guitar, piano, other  Demucs v4: htdemucs_6s
```

#### Limiting Results

You can limit the number of results shown using `--list_limit`. This is useful for finding the best performing models for a particular stem.
For example, to see the top 5 vocal separation models:

```sh
audio-separator -l --list_filter=vocals --list_limit=5
```

Example output:
```
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Model Filename                             Arch  Output Stems (SDR)                   Friendly Name
--------------------------------------------------------------------------------------------------------------------------------------------------------------
model_bs_roformer_ep_317_sdr_12.9755.ckpt  MDXC  vocals* (12.9), instrumental (17.0)  Roformer Model: BS-Roformer-Viperx-1297
model_bs_roformer_ep_368_sdr_12.9628.ckpt  MDXC  vocals* (12.9), instrumental (17.0)  Roformer Model: BS-Roformer-Viperx-1296
vocals_mel_band_roformer.ckpt              MDXC  vocals* (12.6), other                Roformer Model: MelBand Roformer | Vocals by Kimberley Jensen
melband_roformer_big_beta4.ckpt            MDXC  vocals* (12.5), other                Roformer Model: MelBand Roformer Kim | Big Beta 4 FT by unwa
mel_band_roformer_kim_ft_unwa.ckpt         MDXC  vocals* (12.4), other                Roformer Model: MelBand Roformer Kim | FT by unwa
```

#### JSON Output

For programmatic use, you can output the model list in JSON format:

```sh
audio-separator -l --list_format=json
```
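As a sketch of that programmatic use: the JSON schema isn't documented in this README, so the snippet below (a hypothetical helper, not part of the package API) just shells out to the CLI, parses whatever comes back, and reports its shape:

```python
# Sketch: fetch the model list as JSON via the CLI and parse it.
import json
import subprocess

result = subprocess.run(
    ["audio-separator", "-l", "--list_format=json"],
    capture_output=True, text=True, check=True,
)
models = json.loads(result.stdout)
print(f"Parsed a {type(models).__name__} with {len(models)} entries")
```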
### Processing Large Files

For very long audio files (>1 hour), you may encounter out-of-memory errors. The `--chunk_duration` option automatically splits large files into smaller chunks, processes them separately, and merges the results:

```sh
# Process an 8-hour podcast in 10-minute chunks
audio-separator long_podcast.wav --chunk_duration 600

# Adjust chunk size based on available memory
audio-separator very_long_audio.wav --chunk_duration 300  # 5-minute chunks
```

#### How It Works

1. **Split**: The input file is split into fixed-duration chunks (e.g. 10 minutes)
2. **Process**: Each chunk is processed separately, reducing peak memory usage
3. **Merge**: The results are merged back together with simple concatenation

The chunking feature supports all model types:

- **2-stem models** (e.g. MDX): Vocals + Instrumental
- **4-stem models** (e.g. Demucs): Drums, Bass, Other, Vocals
- **6-stem models** (e.g. Demucs 6s): Bass, Drums, Other, Vocals, Guitar, Piano

#### Benefits

- **Prevents OOM errors**: Process files of any length without running out of memory
- **Predictable memory usage**: Memory usage stays bounded regardless of file length
- **No quality loss**: Each chunk is fully processed with the selected model
- **Multi-stem support**: Works seamlessly with 2, 4, and 6-stem models

#### Recommendations

- **Files > 1 hour**: Use `--chunk_duration 600` (10 minutes)
- **Limited-memory systems**: Use smaller chunks (300-600 seconds)
- **Ample memory**: You may not need chunking at all

#### Note on Audio Quality

Chunks are concatenated without crossfading, which in rare cases may produce minor artifacts at chunk boundaries. For most use cases these are not noticeable. The simple concatenation approach keeps processing time minimal while solving out-of-memory issues.

### Ensembling Multiple Models

You can combine the results of multiple models to improve separation quality. This will run each model and then combine their outputs using a specified algorithm.

#### CLI Usage

Use `-m` for the primary model and `--extra_models` for additional models. You can also specify the ensemble algorithm using `--ensemble_algorithm`.

```sh
# Ensemble two models using the default 'avg_wave' algorithm
audio-separator audio.wav -m model1.ckpt --extra_models model2.onnx

# Ensemble multiple models using a specific algorithm
audio-separator audio.wav -m model1.ckpt --extra_models model2.onnx model3.ckpt --ensemble_algorithm max_fft

# With custom weights (must match the number of models)
audio-separator audio.wav -m model1.ckpt --extra_models model2.onnx --ensemble_weights 2.0 1.0
```

#### Python API Usage

```python
from audio_separator.separator import Separator

# Initialize the Separator class with custom parameters
separator = Separator(
    output_dir='output',
    ensemble_algorithm='avg_wave'
)

# List of models to ensemble
# Note: these models will be downloaded automatically if not present
models = [
    'UVR-MDX-NET-Inst_HQ_3.onnx',
    'UVR_MDXNET_KARA_2.onnx'
]

# Specify multiple models for ensembling
separator.load_model(model_filename=models)

# Perform separation
output_files = separator.separate('audio.wav')
```

#### Supported Ensemble Algorithms

- `avg_wave`: Weighted average of waveforms (default)
- `median_wave`: Median of waveforms
- `min_wave`: Minimum of waveforms
- `max_wave`: Maximum of waveforms
- `avg_fft`: Weighted average of spectrograms
- `median_fft`: Median of spectrograms
- `min_fft`: Minimum of spectrograms
- `max_fft`: Maximum of spectrograms
- `uvr_max_spec`: UVR-based maximum spectrogram ensemble
- `uvr_min_spec`: UVR-based minimum spectrogram ensemble
- `ensemble_wav`: UVR-based least-noisy-chunk ensemble

#### Ensemble Presets

Instead of specifying models and algorithms manually, you can use curated presets based on community-tested combinations:

```sh
# List available presets
audio-separator --list_presets

# Use a preset (models and algorithm are configured automatically)
audio-separator audio.wav --ensemble_preset vocal_balanced

# Override a preset's algorithm
audio-separator audio.wav --ensemble_preset vocal_balanced --ensemble_algorithm max_fft
```

**Python API:**
```python
separator = Separator(output_dir='output', ensemble_preset='vocal_balanced')
separator.load_model()  # Uses the preset's models automatically
output_files = separator.separate('audio.wav')
```

Available presets:

| Preset | Use Case | Models | Algorithm |
|--------|----------|--------|-----------|
| `instrumental_clean` | Cleanest instrumentals, minimal vocal bleed | 2 | `uvr_max_spec` |
| `instrumental_full` | Maximum instrument preservation | 2 | `uvr_max_spec` |
| `instrumental_balanced` | Good noise/fullness balance | 2 | `uvr_max_spec` |
| `instrumental_low_resource` | Fast, low VRAM | 2 | `avg_fft` |
| `vocal_balanced` | Best overall vocal quality | 2 | `avg_fft` |
| `vocal_clean` | Minimal instrument bleed | 2 | `min_fft` |
| `vocal_full` | Maximum vocal capture | 2 | `max_fft` |
| `vocal_rvc` | Optimized for RVC/AI training | 2 | `avg_wave` |
| `karaoke` | Lead vocal removal | 3 | `avg_wave` |

Presets are defined in `audio_separator/ensemble_presets.json` - contributions welcome via PR!
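The CLI weights example above also has a Python counterpart via the `ensemble_weights` parameter documented below. A minimal sketch, assuming the parameter takes a list of floats matching the order of the model list:

```python
from audio_separator.separator import Separator

# Weighted ensemble: weights correspond to the models in order,
# mirroring the CLI example `--ensemble_weights 2.0 1.0`.
separator = Separator(
    output_dir='output',
    ensemble_algorithm='avg_wave',
    ensemble_weights=[2.0, 1.0],
)
separator.load_model(model_filename=[
    'UVR-MDX-NET-Inst_HQ_3.onnx',
    'UVR_MDXNET_KARA_2.onnx',
])
output_files = separator.separate('audio.wav')
```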
### Full command-line interface options

```sh
usage: audio-separator [-h] [-v] [-d] [-e] [-l] [--log_level LOG_LEVEL] [--list_filter LIST_FILTER] [--list_limit LIST_LIMIT] [--list_format {pretty,json}] [-m MODEL_FILENAME] [--output_format OUTPUT_FORMAT]
                       [--output_bitrate OUTPUT_BITRATE] [--output_dir OUTPUT_DIR] [--model_file_dir MODEL_FILE_DIR] [--download_model_only] [--invert_spect] [--normalization NORMALIZATION]
                       [--amplification AMPLIFICATION] [--single_stem SINGLE_STEM] [--sample_rate SAMPLE_RATE] [--use_soundfile] [--use_autocast] [--custom_output_names CUSTOM_OUTPUT_NAMES]
                       [--mdx_segment_size MDX_SEGMENT_SIZE] [--mdx_overlap MDX_OVERLAP] [--mdx_batch_size MDX_BATCH_SIZE] [--mdx_hop_length MDX_HOP_LENGTH] [--mdx_enable_denoise] [--vr_batch_size VR_BATCH_SIZE]
                       [--vr_window_size VR_WINDOW_SIZE] [--vr_aggression VR_AGGRESSION] [--vr_enable_tta] [--vr_high_end_process] [--vr_enable_post_process]
                       [--vr_post_process_threshold VR_POST_PROCESS_THRESHOLD] [--demucs_segment_size DEMUCS_SEGMENT_SIZE] [--demucs_shifts DEMUCS_SHIFTS] [--demucs_overlap DEMUCS_OVERLAP]
                       [--demucs_segments_enabled DEMUCS_SEGMENTS_ENABLED] [--mdxc_segment_size MDXC_SEGMENT_SIZE] [--mdxc_override_model_segment_size] [--mdxc_overlap MDXC_OVERLAP]
                       [--mdxc_batch_size MDXC_BATCH_SIZE] [--mdxc_pitch_shift MDXC_PITCH_SHIFT]
                       [audio_files ...]

Separate audio file into different stems.

positional arguments:
  audio_files                                            The audio file paths or directory to separate, in any common format.

options:
  -h, --help                                             show this help message and exit

Info and Debugging:
  -v, --version                                          Show the program's version number and exit.
  -d, --debug                                            Enable debug logging, equivalent to --log_level=debug.
  -e, --env_info                                         Print environment information and exit.
  -l, --list_models                                      List all supported models and exit. Use --list_filter to filter/sort the list and --list_limit to show only the top N results.
  --log_level LOG_LEVEL                                  Log level, e.g. info, debug, warning (default: info).
  --list_filter LIST_FILTER                              Filter and sort the model list by 'name', 'filename', or any stem, e.g. vocals, instrumental, drums
  --list_limit LIST_LIMIT                                Limit the number of models shown
  --list_format {pretty,json}                            Format for listing models: 'pretty' for formatted output, 'json' for raw JSON dump

Separation I/O Params:
  -m MODEL_FILENAME, --model_filename MODEL_FILENAME     Model to use for separation (default: model_bs_roformer_ep_317_sdr_12.9755.yaml). Example: -m 2_HP-UVR.pth
  --output_format OUTPUT_FORMAT                          Output format for separated files, any common format (default: FLAC). Example: --output_format=MP3
  --output_bitrate OUTPUT_BITRATE                        Output bitrate for separated files, any ffmpeg-compatible bitrate (default: None). Example: --output_bitrate=320k
  --output_dir OUTPUT_DIR                                Directory to write output files (default: <current dir>). Example: --output_dir=/app/separated
  --model_file_dir MODEL_FILE_DIR                        Model files directory (default: /tmp/audio-separator-models/). Example: --model_file_dir=/app/models
  --download_model_only                                  Download a single model file only, without performing separation.

Common Separation Parameters:
  --invert_spect                                         Invert secondary stem using spectrogram (default: False). Example: --invert_spect
  --normalization NORMALIZATION                          Max peak amplitude to normalize input and output audio to (default: 0.9). Example: --normalization=0.7
  --amplification AMPLIFICATION                          Min peak amplitude to amplify input and output audio to (default: 0.0). Example: --amplification=0.4
  --single_stem SINGLE_STEM                              Output only single stem, e.g. Instrumental, Vocals, Drums, Bass, Guitar, Piano, Other. Example: --single_stem=Instrumental
  --sample_rate SAMPLE_RATE                              Modify the sample rate of the output audio (default: 44100). Example: --sample_rate=44100
  --use_soundfile                                        Use soundfile to write audio output (default: False). Example: --use_soundfile
  --use_autocast                                         Use PyTorch autocast for faster inference (default: False). Do not use for CPU inference. Example: --use_autocast
  --custom_output_names CUSTOM_OUTPUT_NAMES              Custom names for all output files in JSON format (default: None). Example: --custom_output_names='{"Vocals": "vocals_output", "Drums": "drums_output"}'

MDX Architecture Parameters:
  --mdx_segment_size MDX_SEGMENT_SIZE                    Larger consumes more resources, but may give better results (default: 256). Example: --mdx_segment_size=256
  --mdx_overlap MDX_OVERLAP                              Amount of overlap between prediction windows, 0.001-0.999. Higher is better but slower (default: 0.25). Example: --mdx_overlap=0.25
  --mdx_batch_size MDX_BATCH_SIZE                        Larger consumes more RAM but may process slightly faster (default: 1). Example: --mdx_batch_size=4
  --mdx_hop_length MDX_HOP_LENGTH                        Usually called stride in neural networks; only change this if you know what you're doing (default: 1024). Example: --mdx_hop_length=1024
  --mdx_enable_denoise                                   Enable denoising during separation (default: False). Example: --mdx_enable_denoise

VR Architecture Parameters:
  --vr_batch_size VR_BATCH_SIZE                          Number of batches to process at a time. Higher = more RAM, slightly faster processing (default: 1). Example: --vr_batch_size=16
  --vr_window_size VR_WINDOW_SIZE                        Balance quality and speed. 1024 = fast but lower quality, 320 = slower but better quality (default: 512). Example: --vr_window_size=320
  --vr_aggression VR_AGGRESSION                          Intensity of primary stem extraction, -100 to 100. Typically 5 for vocals & instrumentals (default: 5). Example: --vr_aggression=2
  --vr_enable_tta                                        Enable Test-Time-Augmentation; slow but improves quality (default: False). Example: --vr_enable_tta
  --vr_high_end_process                                  Mirror the missing frequency range of the output (default: False). Example: --vr_high_end_process
  --vr_enable_post_process                               Identify leftover artifacts within vocal output; may improve separation for some songs (default: False). Example: --vr_enable_post_process
  --vr_post_process_threshold VR_POST_PROCESS_THRESHOLD  Threshold for the post_process feature: 0.1-0.3 (default: 0.2). Example: --vr_post_process_threshold=0.1

Demucs Architecture Parameters:
  --demucs_segment_size DEMUCS_SEGMENT_SIZE              Size of segments into which the audio is split, 1-100. Higher = slower but better quality (default: Default). Example: --demucs_segment_size=256
  --demucs_shifts DEMUCS_SHIFTS                          Number of predictions with random shifts, higher = slower but better quality (default: 2). Example: --demucs_shifts=4
  --demucs_overlap DEMUCS_OVERLAP                        Overlap between prediction windows, 0.001-0.999. Higher = slower but better quality (default: 0.25). Example: --demucs_overlap=0.25
  --demucs_segments_enabled DEMUCS_SEGMENTS_ENABLED      Enable segment-wise processing (default: True). Example: --demucs_segments_enabled=False

MDXC Architecture Parameters:
  --mdxc_segment_size MDXC_SEGMENT_SIZE                  Larger consumes more resources, but may give better results (default: 256). Example: --mdxc_segment_size=256
  --mdxc_override_model_segment_size                     Use the specified segment size instead of the model's default value. Example: --mdxc_override_model_segment_size
  --mdxc_overlap MDXC_OVERLAP                            Amount of overlap between prediction windows, 2-50. Higher is better but slower (default: 8). Example: --mdxc_overlap=8
  --mdxc_batch_size MDXC_BATCH_SIZE                      Larger consumes more RAM but may process slightly faster (default: 1). Example: --mdxc_batch_size=4
  --mdxc_pitch_shift MDXC_PITCH_SHIFT                    Shift audio pitch by a number of semitones while processing. May improve output for deep/high vocals (default: 0). Example: --mdxc_pitch_shift=2
```

### As a Dependency in a Python Project

You can use Audio Separator in your own Python project. Here's a minimal example using the default two-stem (Instrumental and Vocals) model:

```python
from audio_separator.separator import Separator

# Initialize the Separator class (with optional configuration properties, below)
separator = Separator()

# Load a machine learning model (if unspecified, defaults to 'model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt')
separator.load_model()

# Perform the separation on specific audio files without reloading the model
output_files = separator.separate('audio1.wav')

print(f"Separation complete! Output file(s): {' '.join(output_files)}")
```
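If you only need one of the stems, the documented `output_single_stem` option (see the parameter list below) can be combined with an explicit model choice. A sketch, using a model filename that appears earlier in this README:

```python
from audio_separator.separator import Separator

# Write only the Vocals stem, encoded as MP3.
separator = Separator(output_single_stem='Vocals', output_format='MP3')
separator.load_model(model_filename='UVR_MDXNET_KARA_2.onnx')
output_files = separator.separate('audio1.wav')
print(output_files)
```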
#### Batch processing and processing with multiple models

You can process multiple files without reloading the model, to save time and memory.

You only need to load a model when choosing or changing models. See the example below:

```python
from audio_separator.separator import Separator

# Initialize the Separator class (with optional configuration properties, below)
separator = Separator()

# Load a model
separator.load_model(model_filename='model_bs_roformer_ep_317_sdr_12.9755.ckpt')

# Separate multiple audio files without reloading the model
output_files = separator.separate(['audio1.wav', 'audio2.wav', 'audio3.wav'])

# Load a different model
separator.load_model(model_filename='UVR_MDXNET_KARA_2.onnx')

# Separate the same files with the new model
output_files = separator.separate(['audio1.wav', 'audio2.wav', 'audio3.wav'])
```

You can also specify the path to a folder containing audio files, instead of listing the full path to each of them:

```python
from audio_separator.separator import Separator

# Initialize the Separator class (with optional configuration properties, below)
separator = Separator()

# Load a model
separator.load_model(model_filename='model_bs_roformer_ep_317_sdr_12.9755.ckpt')

# Separate all audio files located in a folder
output_files = separator.separate('path/to/audio_directory')
```

#### Renaming Stems

You can rename the output files by specifying the desired names. For example:

```python
output_names = {
    "Vocals": "vocals_output",
    "Instrumental": "instrumental_output",
}
output_files = separator.separate('audio1.wav', output_names)
```

In this case, the output file names will be `vocals_output.wav` and `instrumental_output.wav`.

You can also rename specific stems:

- To rename the Vocals stem:
  ```python
  output_names = {
      "Vocals": "vocals_output",
  }
  output_files = separator.separate('audio1.wav', output_names)
  ```
  > The output files will be named: `vocals_output.wav` and `audio1_(Instrumental)_model_mel_band_roformer_ep_3005_sdr_11.wav`
- To rename the Instrumental stem:
  ```python
  output_names = {
      "Instrumental": "instrumental_output",
  }
  output_files = separator.separate('audio1.wav', output_names)
  ```
  > The output files will be named: `audio1_(Vocals)_model_mel_band_roformer_ep_3005_sdr_11.wav` and `instrumental_output.wav`
- List of stems for Demucs models:
  - htdemucs_6s.yaml
    ```python
    output_names = {
        "Vocals": "vocals_output",
        "Drums": "drums_output",
        "Bass": "bass_output",
        "Other": "other_output",
        "Guitar": "guitar_output",
        "Piano": "piano_output",
    }
    ```
  - Other Demucs models
    ```python
    output_names = {
        "Vocals": "vocals_output",
        "Drums": "drums_output",
        "Bass": "bass_output",
        "Other": "other_output",
    }
    ```
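Putting those pieces together for a six-stem model, here is a sketch that loads `htdemucs_6s.yaml` (listed in the filtering example earlier) and renames every stem per the mapping above:

```python
from audio_separator.separator import Separator

separator = Separator()
separator.load_model(model_filename='htdemucs_6s.yaml')

# Rename all six stems produced by the htdemucs_6s model
output_names = {
    "Vocals": "vocals_output",
    "Drums": "drums_output",
    "Bass": "bass_output",
    "Other": "other_output",
    "Guitar": "guitar_output",
    "Piano": "piano_output",
}
output_files = separator.separate('audio1.wav', output_names)
```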
## Parameters for the Separator class

- **`log_level`:** (Optional) Logging level, e.g. INFO, DEBUG, WARNING. `Default: logging.INFO`
- **`log_formatter`:** (Optional) The log format. `Default: None`, which falls back to `'%(asctime)s - %(levelname)s - %(module)s - %(message)s'`
- **`model_file_dir`:** (Optional) Directory to cache model files in. `Default: /tmp/audio-separator-models/`
- **`output_dir`:** (Optional) Directory where the separated files will be saved. If not specified, uses the current directory.
- **`output_format`:** (Optional) Format to encode output files, any common format (WAV, MP3, FLAC, M4A, etc.). `Default: WAV`
- **`normalization_threshold`:** (Optional) Max peak amplitude to normalize input and output audio to. `Default: 0.9`
- **`amplification_threshold`:** (Optional) The minimum peak amplitude to which the waveform will be amplified. If the peak amplitude of the audio is below this threshold, the waveform is scaled up to meet it. `Default: 0.0`
- **`output_single_stem`:** (Optional) Output only a single stem, such as 'Instrumental' or 'Vocals'. `Default: None`
- **`invert_using_spec`:** (Optional) Flag to invert the secondary stem using the spectrogram. `Default: False`
- **`sample_rate`:** (Optional) Set the sample rate of the output audio. `Default: 44100`
- **`use_soundfile`:** (Optional) Use soundfile for writing output; can solve OOM issues, especially on longer audio. `Default: False`
- **`use_autocast`:** (Optional) Flag to use PyTorch autocast for faster inference. Do not use for CPU inference. `Default: False`
- **`mdx_params`:** (Optional) MDX architecture-specific attributes and defaults. `Default: {"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1, "enable_denoise": False}`
- **`vr_params`:** (Optional) VR architecture-specific attributes and defaults. `Default: {"batch_size": 1, "window_size": 512, "aggression": 5, "enable_tta": False, "enable_post_process": False, "post_process_threshold": 0.2, "high_end_process": False}`
- **`demucs_params`:** (Optional) Demucs architecture-specific attributes and defaults. `Default: {"segment_size": "Default", "shifts": 2, "overlap": 0.25, "segments_enabled": True}` _(Note: a `segment_size` of "Default" uses the model's internal default, typically 40 for older Demucs models and 10 for Demucs v4/htdemucs)_
- **`mdxc_params`:** (Optional) MDXC architecture-specific attributes and defaults. `Default: {"segment_size": 256, "override_model_segment_size": False, "batch_size": 1, "overlap": 8, "pitch_shift": 0}`
- **`ensemble_algorithm`:** (Optional) Algorithm to use for ensembling multiple models. `Default: 'avg_wave'`
- **`ensemble_weights`:** (Optional) Weights for each model in the ensemble. `Default: None` (equal weights)
- **`ensemble_preset`:** (Optional) Named ensemble preset (e.g. `'vocal_balanced'`, `'karaoke'`). Sets models, algorithm, and weights automatically. Use `Separator(info_only=True).list_ensemble_presets()` to see all. `Default: None`
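As a summary of the options above, here is a sketch wiring several of the documented constructor parameters together (the values are illustrative, not recommendations):

```python
import logging

from audio_separator.separator import Separator

# Illustrative combination of documented Separator parameters
separator = Separator(
    log_level=logging.DEBUG,
    model_file_dir='/tmp/audio-separator-models/',
    output_dir='separated',
    output_format='FLAC',
    normalization_threshold=0.9,
    sample_rate=44100,
    mdx_params={"hop_length": 1024, "segment_size": 256, "overlap": 0.25,
                "batch_size": 1, "enable_denoise": True},
)
separator.load_model(model_filename='UVR-MDX-NET-Inst_HQ_3.onnx')
output_files = separator.separate('audio1.wav')
```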
## Remote API Usage 🌐

Audio Separator includes a remote API client that allows you to connect to a deployed Audio Separator API service, enabling you to perform audio separation without running the models locally. The API uses asynchronous processing with job polling for efficient handling of separation tasks.

To deploy Audio Separator as an API on modal.com and use it for remote processing, see the detailed documentation here: [audio_separator/remote/README.md](audio_separator/remote/README.md).

## Requirements 📋

Python >= 3.10

Libraries: torch, onnx, onnxruntime, numpy, librosa, requests, six, tqdm, pydub

## Developing Locally

This project uses Poetry for dependency management and packaging. Follow these steps to set up a local development environment:

### Prerequisites

- Make sure you have Python 3.10 or newer installed on your machine.
- Install Conda (I recommend Miniforge: [Miniforge GitHub](https://github.com/conda-forge/miniforge)) to manage your Python virtual environments.

### Clone the Repository

Clone the repository to your local machine:

```sh
git clone https://github.com/YOUR_USERNAME/audio-separator.git
cd audio-separator
```

Replace `YOUR_USERNAME` with your GitHub username if you've forked the repository, or use the main repository URL if you have the permissions.

### Create and activate the Conda Environment

To create and activate the conda environment, use the following commands:

```sh
conda env create
conda activate audio-separator-dev
```

### Install Dependencies

Once you're inside the conda env, run the following command to install the project dependencies:

```sh
poetry install
```

Then install the extra dependencies for your platform (`cpu`, `gpu`, or `dml`):

```sh
poetry install --extras "cpu"
```
or
```sh
poetry install --extras "gpu"
```
or
```sh
poetry install --extras "dml"
```

### Running the Command-Line Interface Locally

You can run the CLI command directly within the virtual environment. For example:

```sh
audio-separator path/to/your/audio-file.wav
```

### Deactivate the Virtual Environment

Once you are done with your development work, you can exit the virtual environment by simply typing:

```sh
conda deactivate
```

### Building the Package

To build the package for distribution, use the following command:

```sh
poetry build
```

This will generate the distribution packages in the `dist` directory - but for now, only @beveradb is able to publish to PyPI.

## Contributing 🤝

Contributions are very much welcome! Please fork the repository and submit a pull request with your changes, and I'll try to review, merge and publish promptly!

- This project is 100% open-source and free for anyone to use and modify as they wish.
- If the maintenance workload for this repo somehow becomes too much for me, I'll ask for volunteers to share maintainership of the repo, though I don't think that is very likely.
- Development and support for the MDX-Net separation models is part of the main [UVR project](https://github.com/Anjok07/ultimatevocalremovergui); this repo is just a CLI/Python package wrapper to simplify running those models programmatically. So, if you want to try and improve the actual models, please get involved in the UVR project and look for guidance there!

## License 📄

This project is licensed under the [MIT License](LICENSE).

- **Please note:** If you choose to integrate this project into some other project using the default model, or any other model trained as part of the [UVR](https://github.com/Anjok07/ultimatevocalremovergui) project, please honor the MIT license by providing credit to UVR and its developers!
## Credits 🙏

- [Anjok07](https://github.com/Anjok07) - Author of [Ultimate Vocal Remover GUI](https://github.com/Anjok07/ultimatevocalremovergui), from which almost all of the code in this repo was copied! Definitely deserving of credit for anything good from this project. Thank you!
- [DilanBoskan](https://github.com/DilanBoskan) - Your contributions at the start of this project were essential to the success of UVR. Thank you!
- [Kuielab & Woosung Choi](https://github.com/kuielab) - Developed the original MDX-Net AI code.
- [KimberleyJSN](https://github.com/KimberleyJensen) - Advised and aided the implementation of the training scripts for MDX-Net and Demucs. Thank you!
- [Hv](https://github.com/NaJeongMo/Colab-for-MDX_B) - Helped implement chunks into the MDX-Net AI code. Thank you!
- [zhzhongshi](https://github.com/zhzhongshi) - Helped add support for the MDXC models in `audio-separator`. Thank you!

## Contact 💌

For questions or feedback, please raise an issue or reach out to @beveradb ([Andrew Beveridge](mailto:andrew@beveridge.uk)) directly.

---

<div align="center">

## Thanks to all contributors for their efforts

<a href="https://github.com/nomadkaraoke/python-audio-separator/graphs/contributors">
  <img src="https://oss.gittoolsai.com/images/nomadkaraoke_python-audio-separator_readme_a77de88dc878.png" />
</a>

</div>
依赖](#-ffmpeg-dependency)\n  - [GPU \u002F CUDA 特定的 Pip 安装步骤](#gpu--cuda-specific-installation-steps-with-pip)\n    - [可能需要多个 CUDA 库版本](#multiple-cuda-library-versions-may-be-needed)\n  - [使用 🚀](#usage-)\n    - [命令行界面 (CLI)](#command-line-interface-cli)\n    - [列出并筛选可用模型](#listing-and-filtering-available-models)\n      - [筛选模型](#filtering-models)\n      - [限制结果](#limiting-results)\n      - [JSON 输出](#json-output)\n    - [完整的命令行选项](#full-command-line-interface-options)\n    - [作为 Python 项目的依赖项](#as-a-dependency-in-a-python-project)\n      - [批量处理及使用多个模型进行处理](#batch-processing-and-processing-with-multiple-models)\n      - [重命名音轨](#renaming-stems)\n  - [Separator 类的参数](#parameters-for-the-separator-class)\n  - [远程 API 使用 🌐](#remote-api-usage-)\n  - [要求 📋](#requirements-)\n  - [本地开发](#developing-locally)\n    - [先决条件](#prerequisites)\n    - [克隆仓库](#clone-the-repository)\n    - [创建并激活 Conda 环境](#create-and-activate-the-conda-environment)\n    - [安装依赖](#install-dependencies)\n    - [在本地运行命令行界面](#running-the-command-line-interface-locally)\n    - [停用虚拟环境](#deactivate-the-virtual-environment)\n    - [构建软件包](#building-the-package)\n  - [贡献 🤝](#contributing-)\n  - [许可证 📄](#license-)\n  - [致谢 🙏](#credits-)\n  - [联系方式 💌](#contact-)\n  - [感谢所有贡献者的努力](#thanks-to-all-contributors-for-their-efforts)\n\u003C\u002Fdetails>\n\n---\n\n## 特性\n\n- 将音频分离成多个音轨，例如伴奏和人声。\n- 支持所有常见音频格式（WAV、MP3、FLAC、M4A 等）。\n- 可以使用 PTH 或 ONNX 格式的预训练模型进行推理。\n- 提供 CLI 支持，便于在脚本中使用和批量处理。\n- 提供 Python API，方便集成到其他项目中。\n\n## 安装 🛠️\n\n### 🐳 Docker\n\n如果您可以使用 Docker，则无需实际安装任何东西——Docker Hub 上已发布适用于 GPU（CUDA）和 CPU 推理的镜像，支持 `amd64` 和 `arm64` 平台。\n\n您可能希望挂载一个包含待分离文件的目录，并将其用作输出目录。\n\n例如，如果当前目录下有一个名为 `input.wav` 的文件，您可以按如下方式运行 `audio-separator`（更多详情请参阅 [使用部分](#usage-)）：\n\n```sh\ndocker run -it -v `pwd`:\u002Fworkdir beveradb\u002Faudio-separator input.wav\n```\n\n如果您使用的是带有 GPU 的机器，则应使用 GPU 特定的镜像，并将 GPU 设备传递给容器，如下所示：\n\n```sh\ndocker run -it --gpus all -v `pwd`:\u002Fworkdir beveradb\u002Faudio-separator:gpu input.wav\n```\n\n如果未检测到 GPU，请确保您的 Docker 运行时环境正确地透传了 GPU——网上有许多指南可以帮助您完成此操作，例如 [这篇教程](https:\u002F\u002Fwww.celantur.com\u002Fblog\u002Frun-cuda-in-docker-on-linux\u002F)。\n\n### 🎮 带有 CUDA 的 Nvidia GPU 或 🧪 Google Colab\n\n**支持的 CUDA 版本:** 11.8 和 12.2\n\n💬 如果配置成功，运行 `audio-separator --env_info` 时应会看到以下日志消息：\n `ONNXruntime 已启用 CUDAExecutionProvider，可实现加速`\n\nConda:\n```sh\nconda install pytorch=*=*cuda* onnxruntime=*=*cuda* audio-separator -c pytorch -c conda-forge\n```\n\nPip:\n```sh\npip install \"audio-separator[gpu]\"\n```\n\nDocker:\n```sh\nbeveradb\u002Faudio-separator:gpu\n```\n\n###  Apple Silicon, macOS Sonoma+ 使用 M1 或更高版本 CPU（CoreML 加速）\n\n💬 如果配置成功，运行 `audio-separator --env_info` 时应会看到以下日志消息：\n `ONNXruntime 已启用 CoreMLExecutionProvider，可实现加速`\n\nPip:\n```sh\npip install \"audio-separator[cpu]\"\n```\n\n### 🐢 无硬件加速，仅使用 CPU\n\nConda:\n```sh\nconda install audio-separator -c pytorch -c conda-forge\n```\n\nPip:\n```sh\npip install \"audio-separator[cpu]\"\n```\n\nDocker:\n```sh\nbeveradb\u002Faudio-separator\n```\n\n### 🎥 FFmpeg 依赖\n\n💬 要测试 `audio-separator` 是否已成功配置为使用 FFmpeg，请运行 `audio-separator --env_info`。日志中应显示 `FFmpeg installed`。\n\n如果您使用 `conda` 或 `docker` 安装了 `audio-separator`，那么您的环境中应该已经具备 FFmpeg。\n\n您可能需要单独安装 FFmpeg。在大多数平台上，安装都非常简单，例如：\n\n🐧 Debian\u002FUbuntu：\n```sh\napt-get update; apt-get install -y ffmpeg\n```\n\n macOS：\n```sh\nbrew update; brew install ffmpeg\n```\n\n## 使用 Pip 进行 GPU \u002F CUDA 特定的安装步骤\n\n理论上，为了让 `audio-separator` 在 GPU 上运行，您只需按照上述方式使用 `[gpu]` 附加组件进行安装即可。\n\n然而，有时同时让 PyTorch 和 ONNX Runtime 支持 CUDA 
可能会有些棘手，因此未必能够顺利运行。\n\n您可能需要直接重新安装这两个包，让 pip 自动为您计算适合您平台的正确版本，例如：\n\n- `pip uninstall torch onnxruntime`\n- `pip cache purge`\n- `pip install --force-reinstall torch torchvision torchaudio`\n- `pip install --force-reinstall onnxruntime-gpu`\n\n我通常建议使用此处向导推荐的命令，为您的环境安装最新版本的 PyTorch：\n\u003Chttps:\u002F\u002Fpytorch.org\u002Fget-started\u002Flocally\u002F>\n\n### 可能需要多个 CUDA 库版本\n\n根据您的 CUDA 版本和环境，您可能需要安装特定版本的 CUDA 库，以便 ONNX Runtime 能够使用您的 GPU。\n\n🧪 例如，Google Colab 现在默认使用 CUDA 12，但 ONNX Runtime 仍然需要 CUDA 11 的库才能正常工作。\n\n如果您在运行 `audio-separator` 时看到错误信息 `Failed to load library` 或 `cannot open shared object file`，这很可能是问题所在。\n\n您可以将 CUDA 11 的库与 CUDA 12 一起安装，方法如下：\n```sh\napt update; apt install nvidia-cuda-toolkit\n```\n\n如果您在 Google Colab 或其他环境中运行时遇到以下消息：\n```\n[E:onnxruntime:Default, provider_bridge_ort.cc:1862 TryGetProviderInfo_CUDA] \u002Fonnxruntime_src\u002Fonnxruntime\u002Fcore\u002Fsession\u002Fprovider_bridge_ort.cc:1539 onnxruntime::Provider& onnxruntime::ProviderLibrary::Get() [ONNXRuntimeError] : 1 : FAIL : 加载库 libonnxruntime_providers_cuda.so 失败，错误为：libcudnn_adv.so.9: 无法打开共享对象文件：没有这样的文件或目录\n\n[W:onnxruntime:Default, onnxruntime_pybind_state.cc:993 CreateExecutionProviderInstance] 创建 CUDAExecutionProvider 失败。需要 cuDNN 9.* 和 CUDA 12.*。请按照 GPU 要求页面（https:\u002F\u002Fonnxruntime.ai\u002Fdocs\u002Fexecution-providers\u002FCUDA-ExecutionProvider.html#requirements）中的说明安装所有依赖项，确保它们位于 PATH 中，并且您的 GPU 是受支持的。\n```\n您可以通过运行以下命令来解决这个问题：\n```sh\npython -m pip install ort-nightly-gpu --index-url=https:\u002F\u002Faiinfra.pkgs.visualstudio.com\u002FPublicPackages\u002F_packaging\u002Fort-cuda-12-nightly\u002Fpypi\u002Fsimple\u002F\n```\n\n> 注：如果有人知道如何使这一过程更加简洁，从而让我们能够在不为每种硬件加速需求单独安装依赖的情况下支持不同平台的特定依赖，请告知我们或提交一个 PR！\n\n## 使用方法 🚀\n\n### 命令行界面 (CLI)\n\n您可以通过命令行使用 Audio Separator，例如：\n\n```sh\naudio-separator \u002Fpath\u002Fto\u002Fyour\u002Finput\u002Faudio.wav --model_filename model_bs_roformer_ep_317_sdr_12.9755.ckpt\n```\n\n此命令将下载指定的模型文件，处理输入音频 `audio.wav`，并在当前目录下生成两个新文件，分别包含人声和伴奏。\n\n**注意：** 您无需自行下载任何文件——audio-separator 会自动为您完成！\n\n要查看支持的模型列表，请运行 `audio-separator --list_models`。\n\n模型列表输出中列出的任何文件（包括文件扩展名）都可以通过 `model_filename` 参数指定（例如 `--model_filename UVR_MDXNET_KARA_2.onnx`），它将在首次使用时自动下载到 `--model_file_dir` 目录（默认为 `\u002Ftmp\u002Faudio-separator-models\u002F`）。\n\n### 列出并筛选可用模型\n\n你可以使用 `--list_models`（或 `-l`）标志查看所有可用模型：\n\n```sh\naudio-separator --list_models\n```\n\n输出会显示一个表格，包含以下列：\n- 模型文件名：用于 `--model_filename` 的文件名\n- 架构：模型架构（MDX、MDXC、Demucs 等）\n- 输出音轨（SDR）：该模型可以分离的音轨，以及在有数据时提供的信噪比分数\n- 友好名称：描述模型的人类可读名称\n\n#### 筛选模型\n\n你可以使用 `--list_filter` 按音轨类型筛选和排序模型列表。例如，要查找可以分离鼓声的模型：\n\n```sh\naudio-separator -l --list_filter=drums\n```\n\n示例输出：\n```\n-----------------------------------------------------------------------------------------------------------------------------------\nModel Filename        Arch    Output Stems (SDR)                                            Friendly Name\n-----------------------------------------------------------------------------------------------------------------------------------\nhtdemucs_ft.yaml      Demucs  vocals (10.8), drums (10.1), bass (11.9), other               Demucs v4: htdemucs_ft\nhdemucs_mmi.yaml      Demucs  vocals (10.3), drums (9.7), bass (12.0), other                Demucs v4: hdemucs_mmi\nhtdemucs.yaml         Demucs  vocals (10.0), drums (9.4), bass (11.3), other                Demucs v4: htdemucs\nhtdemucs_6s.yaml      Demucs  vocals (9.7), drums (8.5), bass (10.0), guitar, piano, other  Demucs v4: htdemucs_6s\n```\n\n#### 限制结果数量\n\n你可以使用 
`--list_limit` 来限制显示的结果数量。这对于查找特定音轨的最佳模型非常有用。例如，要查看前 5 名人声分离模型：\n\n```sh\naudio-separator -l --list_filter=vocals --list_limit=5\n```\n\n示例输出：\n```\n--------------------------------------------------------------------------------------------------------------------------------------------------------------\nModel Filename                             Arch  Output Stems (SDR)                   Friendly Name\n--------------------------------------------------------------------------------------------------------------------------------------------------------------\nmodel_bs_roformer_ep_317_sdr_12.9755.ckpt  MDXC  vocals* (12.9), instrumental (17.0)  Roformer Model: BS-Roformer-Viperx-1297\nmodel_bs_roformer_ep_368_sdr_12.9628.ckpt  MDXC  vocals* (12.9), instrumental (17.0)  Roformer Model: BS-Roformer-Viperx-1296\nvocals_mel_band_roformer.ckpt              MDXC  vocals* (12.6), other                Roformer Model: MelBand Roformer | Vocals by Kimberley Jensen\nmelband_roformer_big_beta4.ckpt            MDXC  vocals* (12.5), other                Roformer Model: MelBand Roformer Kim | Big Beta 4 FT by unwa\nmel_band_roformer_kim_ft_unwa.ckpt         MDXC  vocals* (12.4), other                Roformer Model: MelBand Roformer Kim | FT by unwa\n```\n\n#### JSON 输出\n\n对于程序化使用，你可以将模型列表以 JSON 格式输出：\n\n```sh\naudio-separator -l --list_format=json\n```\n\n### 处理大文件\n\n对于非常长的音频文件（超过 1 小时），你可能会遇到内存不足的错误。`--chunk_duration` 选项会自动将大文件分割成较小的块，分别处理后再合并结果：\n\n```sh\n# 将 8 小时的播客按 10 分钟为单位处理\naudio-separator long_podcast.wav --chunk_duration 600\n\n# 根据可用内存调整分块大小\naudio-separator very_long_audio.wav --chunk_duration 300  # 5 分钟为单位\n```\n\n#### 工作原理\n\n1. **分割**：输入文件被分割成固定时长的块（例如 10 分钟）\n2. **处理**：每个块单独处理，从而降低峰值内存使用量\n3. **合并**：结果通过简单的拼接方式重新合并在一起\n\n分块功能支持所有类型的模型：\n- **2 音轨模型**（如 MDX）：人声 + 伴奏\n- **4 音轨模型**（如 Demucs）：鼓、贝斯、其他、人声\n- **6 音轨模型**（如 Demucs 6s）：贝斯、鼓、其他、人声、吉他、钢琴\n\n#### 优点\n\n- **防止 OOM 错误**：可以处理任意长度的文件而不会耗尽内存\n- **内存使用可控**：无论文件多长，内存使用都保持在一定范围内\n- **无质量损失**：每个块都使用选定的模型完整处理\n- **多音轨支持**：无缝支持 2、4 和 6 音轨模型\n\n#### 建议\n\n- **文件大于 1 小时**：使用 `--chunk_duration 600`（10 分钟）\n- **内存有限的系统**：使用更小的分块（300–600 秒）\n- **内存充足时**：可能根本不需要分块\n\n#### 关于音频质量的说明\n\n分块之间是直接拼接的，没有交叉淡化处理，这在极少数情况下可能会导致分块边界处出现轻微的伪影。不过，在大多数情况下，这些伪影并不明显。这种简单的拼接方式能够在解决内存问题的同时，尽量减少处理时间。\n\n### 组合多个模型\n\n你可以组合多个模型的结果来提高分离质量。这将依次运行每个模型，然后使用指定的算法合并它们的输出。\n\n#### CLI 使用方法\n\n使用 `-m` 指定主模型，用 `--extra_models` 指定额外的模型。你还可以通过 `--ensemble_algorithm` 指定集成算法。\n\n```sh\n# 使用默认的 'avg_wave' 算法组合两个模型\naudio-separator audio.wav -m model1.ckpt --extra_models model2.onnx\n\n# 使用特定算法组合多个模型\naudio-separator audio.wav -m model1.ckpt --extra_models model2.onnx model3.ckpt --ensemble_algorithm max_fft\n\n# 使用自定义权重（必须与模型数量一致）\naudio-separator audio.wav -m model1.ckpt --extra_models model2.onnx --ensemble_weights 2.0 1.0\n```\n\n#### Python API 使用方法\n\n```python\nfrom audio_separator.separator import Separator\n\n# 初始化 Separator 类并设置自定义参数\nseparator = Separator(\n    output_dir='output',\n    ensemble_algorithm='avg_wave'\n)\n\n# 要组合的模型列表\n# 注意：如果这些模型不存在，将会自动下载\nmodels = [\n    'UVR-MDX-NET-Inst_HQ_3.onnx',\n    'UVR_MDXNET_KARA_2.onnx'\n]\n\n# 指定多个模型进行集成\nseparator.load_model(model_filename=models)\n\n# 执行分离\noutput_files = separator.separate('audio.wav')\n```\n\n#### 支持的集成算法\n- `avg_wave`：波形加权平均（默认）\n- `median_wave`：波形中位数\n- `min_wave`：波形最小值\n- `max_wave`：波形最大值\n- `avg_fft`：频谱图加权平均\n- `median_fft`：频谱图中位数\n- `min_fft`：频谱图最小值\n- `max_fft`：频谱图最大值\n- `uvr_max_spec`：基于 UVR 的最大频谱图集成\n- `uvr_min_spec`：基于 UVR 的最小频谱图集成\n- `ensemble_wav`：基于 UVR 的噪声最小块集成\n\n#### 
集成预设\n\n除了手动指定模型和算法外，你还可以使用社区测试过的精选组合预设：\n\n```sh\n# 列出可用的预设\naudio-separator --list_presets\n\n# 使用一个预设（模型和算法会自动配置）\naudio-separator audio.wav --ensemble_preset vocal_balanced\n\n# 覆盖预设的算法\naudio-separator audio.wav --ensemble_preset vocal_balanced --ensemble_algorithm max_fft\n```\n\n**Python API：**\n```python\nseparator = Separator(output_dir='output', ensemble_preset='vocal_balanced')\nseparator.load_model()  # 自动使用预设中的模型\noutput_files = separator.separate('audio.wav')\n```\n\n可用的预设：\n\n| 预设 | 使用场景 | 模型 | 算法 |\n|--------|----------|--------|-----------|\n| `instrumental_clean` | 最干净的纯音乐，人声串音最少 | 2 | `uvr_max_spec` |\n| `instrumental_full` | 最大限度保留乐器音色 | 2 | `uvr_max_spec` |\n| `instrumental_balanced` | 噪音与音色饱满度的良好平衡 | 2 | `uvr_max_spec` |\n| `instrumental_low_resource` | 速度快、显存占用低 | 2 | `avg_fft` |\n| `vocal_balanced` | 整体人声质量最佳 | 2 | `avg_fft` |\n| `vocal_clean` | 乐器串音最小 | 2 | `min_fft` |\n| `vocal_full` | 最大化捕捉人声 | 2 | `max_fft` |\n| `vocal_rvc` | 针对 RVC\u002FAI 训练优化 | 2 | `avg_wave` |\n| `karaoke` | 主唱去除 | 3 | `avg_wave` |\n\n预设定义在 `audio_separator\u002Fensemble_presets.json` 文件中——欢迎通过 PR 贡献！\n
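\n在预设之外，你也可以完全手动地配置集成行为。下面是一个直接设置集成算法与模型权重的最小 Python 草图（`ensemble_algorithm`、`ensemble_weights` 参数见下文“Separator 类的参数”一节；模型文件名沿用上文示例，仅作演示）：\n\n```python\nfrom audio_separator.separator import Separator\n\n# 手动配置集成：给第一个模型两倍权重，并用频谱图加权平均（avg_fft）合并\nseparator = Separator(\n    output_dir='output',\n    ensemble_algorithm='avg_fft',\n    ensemble_weights=[2.0, 1.0],  # 权重数量必须与模型数量一致\n)\n\n# 传入模型列表即启用集成；模型不存在时会自动下载\nseparator.load_model(model_filename=[\n    'UVR-MDX-NET-Inst_HQ_3.onnx',\n    'UVR_MDXNET_KARA_2.onnx',\n])\n\noutput_files = separator.separate('audio.wav')\n```\n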
\n### 完整的命令行选项\n\n```sh\n用法：audio-separator [-h] [-v] [-d] [-e] [-l] [--log_level LOG_LEVEL] [--list_filter LIST_FILTER] [--list_limit LIST_LIMIT] [--list_format {pretty,json}] [-m MODEL_FILENAME] [--output_format OUTPUT_FORMAT]\n                       [--output_bitrate OUTPUT_BITRATE] [--output_dir OUTPUT_DIR] [--model_file_dir MODEL_FILE_DIR] [--download_model_only] [--invert_spect] [--normalization NORMALIZATION]\n                       [--amplification AMPLIFICATION] [--single_stem SINGLE_STEM] [--sample_rate SAMPLE_RATE] [--use_soundfile] [--use_autocast] [--custom_output_names CUSTOM_OUTPUT_NAMES]\n                       [--mdx_segment_size MDX_SEGMENT_SIZE] [--mdx_overlap MDX_OVERLAP] [--mdx_batch_size MDX_BATCH_SIZE] [--mdx_hop_length MDX_HOP_LENGTH] [--mdx_enable_denoise] [--vr_batch_size VR_BATCH_SIZE]\n                       [--vr_window_size VR_WINDOW_SIZE] [--vr_aggression VR_AGGRESSION] [--vr_enable_tta] [--vr_high_end_process] [--vr_enable_post_process]\n                       [--vr_post_process_threshold VR_POST_PROCESS_THRESHOLD] [--demucs_segment_size DEMUCS_SEGMENT_SIZE] [--demucs_shifts DEMUCS_SHIFTS] [--demucs_overlap DEMUCS_OVERLAP]\n                       [--demucs_segments_enabled DEMUCS_SEGMENTS_ENABLED] [--mdxc_segment_size MDXC_SEGMENT_SIZE] [--mdxc_override_model_segment_size] [--mdxc_overlap MDXC_OVERLAP]\n                       [--mdxc_batch_size MDXC_BATCH_SIZE] [--mdxc_pitch_shift MDXC_PITCH_SHIFT]\n                       [audio_files ...]\n\n将音频文件分离成不同的音轨。\n\n位置参数：\n  audio_files                                            要分离的音频文件路径或目录，支持任何常见格式。\n\n选项：\n  -h, --help                                             显示此帮助信息并退出。\n\n信息与调试：\n  -v, --version                                          显示程序版本号并退出。\n  -d, --debug                                            启用调试日志记录，等同于 --log_level=debug。\n  -e, --env_info                                         打印环境信息并退出。\n  -l, --list_models                                      列出所有支持的模型并退出。可使用 --list_filter 对列表进行筛选\u002F排序，使用 --list_limit 限制显示结果的数量。\n  --log_level LOG_LEVEL                                  日志级别，例如 info、debug、warning（默认为 info）。\n  --list_filter LIST_FILTER                              根据名称、文件名或特定音轨（如人声、伴奏、鼓等）筛选和排序模型列表。\n  --list_limit LIST_LIMIT                                限制显示的模型数量。\n  --list_format {pretty,json}                            列表输出格式：'pretty' 用于格式化输出，'json' 用于原始 JSON 输出。\n\n分离输入输出参数：\n  -m MODEL_FILENAME, --model_filename MODEL_FILENAME     用于分离的模型文件名（默认：model_bs_roformer_ep_317_sdr_12.9755.yaml）。示例：-m 2_HP-UVR.pth\n  --output_format OUTPUT_FORMAT                          分离后文件的输出格式，支持任何常见格式（默认：FLAC）。示例：--output_format=MP3\n  --output_bitrate OUTPUT_BITRATE                        分离后文件的输出比特率，支持任何 ffmpeg 兼容的比特率（默认：无）。示例：--output_bitrate=320k\n  --output_dir OUTPUT_DIR                                输出文件保存目录（默认：当前目录）。示例：--output_dir=\u002Fapp\u002Fseparated\n  --model_file_dir MODEL_FILE_DIR                        模型文件存储目录（默认：\u002Ftmp\u002Faudio-separator-models\u002F）。示例：--model_file_dir=\u002Fapp\u002Fmodels\n  --download_model_only                                  仅下载单个模型文件，不执行分离操作。\n\n常用分离参数：\n  --invert_spect                                         使用频谱图反转次要音轨（默认：否）。示例：--invert_spect\n  --normalization NORMALIZATION                          输入和输出音频的最大峰值振幅归一化值（默认：0.9）。示例：--normalization=0.7\n  --amplification AMPLIFICATION                          输入和输出音频的最小峰值振幅放大值（默认：0.0）。示例：--amplification=0.4\n  --single_stem SINGLE_STEM                              只输出单一音轨，例如伴奏（Instrumental）、人声（Vocals）、鼓（Drums）、贝斯（Bass）、吉他（Guitar）、钢琴（Piano）或其他（Other）。示例：--single_stem=Instrumental\n  --sample_rate SAMPLE_RATE                              修改输出音频的采样率（默认：44100）。示例：--sample_rate=44100\n  --use_soundfile                                        使用 soundfile 库写入音频输出（默认：否）。示例：--use_soundfile\n  --use_autocast                                         使用 PyTorch 的 autocast 加速推理（默认：否）。请勿在 CPU 上使用。示例：--use_autocast\n  --custom_output_names CUSTOM_OUTPUT_NAMES              以 JSON 格式自定义所有输出文件的名称（默认：无）。示例：--custom_output_names='{\"Vocals\": \"vocals_output\", \"Drums\": \"drums_output\"}'\n\nMDX 架构参数：\n  --mdx_segment_size MDX_SEGMENT_SIZE                    值越大，资源消耗越多，但可能获得更好的效果（默认值：256）。示例：--mdx_segment_size=256\n  --mdx_overlap MDX_OVERLAP                              预测窗口之间的重叠程度，范围为0.001至0.999。数值越高越好，但速度越慢（默认值：0.25）。示例：--mdx_overlap=0.25\n  --mdx_batch_size MDX_BATCH_SIZE                        值越大，占用的内存越多，但处理速度可能会稍快（默认值：1）。示例：--mdx_batch_size=4\n  --mdx_hop_length MDX_HOP_LENGTH                        在神经网络中通常称为步幅，只有在了解其作用时才应更改（默认值：1024）。示例：--mdx_hop_length=1024\n  --mdx_enable_denoise                                   在分离过程中启用去噪功能（默认值：False）。示例：--mdx_enable_denoise\n\nVR 架构参数：\n  --vr_batch_size VR_BATCH_SIZE                          每次处理的批次数量。数值越高，占用的内存越多，处理速度也会稍快（默认值：1）。示例：--vr_batch_size=16\n  --vr_window_size VR_WINDOW_SIZE                        平衡质量和速度。1024表示速度快但质量较低，320则速度较慢但质量较高。（默认值：512）。示例：--vr_window_size=320\n  --vr_aggression VR_AGGRESSION                          主要音轨提取的强度，范围为-100至100。通常人声和伴奏设置为5（默认值：5）。示例：--vr_aggression=2\n  --vr_enable_tta                                        启用测试时增强；虽然速度较慢，但可以提高质量（默认值：False）。示例：--vr_enable_tta\n  --vr_high_end_process                                  镜像输出中缺失的频率范围（默认值：False）。示例：--vr_high_end_process\n  --vr_enable_post_process                               识别人声输出中的残留伪影；对某些歌曲可能改善分离效果（默认值：False）。示例：--vr_enable_post_process\n  --vr_post_process_threshold VR_POST_PROCESS_THRESHOLD  后处理功能的阈值：0.1至0.3（默认值：0.2）。示例：--vr_post_process_threshold=0.1\n\nDemucs 架构参数：\n  --demucs_segment_size DEMUCS_SEGMENT_SIZE              音频被分割成的片段大小，范围为1至100。数值越高，速度越慢，但质量越好（默认值：Default，即使用模型内部默认值）。示例：--demucs_segment_size=256\n  --demucs_shifts DEMUCS_SHIFTS                          使用随机偏移进行预测的次数，数值越高，速度越慢，但质量越好（默认值：2）。示例：--demucs_shifts=4\n  --demucs_overlap DEMUCS_OVERLAP                        
预测窗口之间的重叠程度，范围为0.001至0.999。数值越高，速度越慢，但质量越好（默认值：0.25）。示例：--demucs_overlap=0.25\n  --demucs_segments_enabled DEMUCS_SEGMENTS_ENABLED      启用分段处理功能（默认值：True）。示例：--demucs_segments_enabled=False\n\nMDXC 架构参数：\n  --mdxc_segment_size MDXC_SEGMENT_SIZE                  值越大，资源消耗越多，但可能获得更好的结果（默认值：256）。示例：--mdxc_segment_size=256\n  --mdxc_override_model_segment_size                     覆盖模型的默认片段大小，而不使用模型的默认值。示例：--mdxc_override_model_segment_size\n  --mdxc_overlap MDXC_OVERLAP                            预测窗口之间的重叠量，范围为2至50。数值越高越好，但速度越慢（默认值：8）。示例：--mdxc_overlap=8\n  --mdxc_batch_size MDXC_BATCH_SIZE                      值越大，占用的内存越多，但处理速度可能会稍快（默认值：1）。示例：--mdxc_batch_size=4\n  --mdxc_pitch_shift MDXC_PITCH_SHIFT                    在处理过程中将音频音高移动若干半音。这可能改善低沉或高亢人声的输出。（默认值：0）。示例：--mdxc_pitch_shift=2\n```\n\n\n\n### 作为 Python 项目的依赖项\n\n您可以在自己的 Python 项目中使用 Audio Separator。以下是一个使用默认双音轨（伴奏和人声）模型的最小示例：\n\n```python\nfrom audio_separator.separator import Separator\n\n# 初始化 Separator 类（可选配置属性如下）\nseparator = Separator()\n\n# 加载机器学习模型（未指定时，默认加载 'model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt'）\nseparator.load_model()\n\n# 对特定音频文件执行分离操作，无需重新加载模型\noutput_files = separator.separate('audio1.wav')\n\nprint(f\"分离完成！输出文件：{' '.join(output_files)}\")\n```\n\n#### 批量处理与多模型处理\n\n您可以一次性处理多个文件，而无需重新加载模型，从而节省时间和内存。\n\n仅在选择或更换模型时才需要加载模型。示例如下：\n\n```python\nfrom audio_separator.separator import Separator\n\n# 初始化 Separator 类（可选配置属性如下）\nseparator = Separator()\n\n# 加载一个模型\nseparator.load_model(model_filename='model_bs_roformer_ep_317_sdr_12.9755.ckpt')\n\n# 分离多个音频文件，无需重新加载模型\noutput_files = separator.separate(['audio1.wav', 'audio2.wav', 'audio3.wav'])\n\n# 加载另一个模型\nseparator.load_model(model_filename='UVR_MDXNET_KARA_2.onnx')\n\n# 使用新模型再次分离相同的文件\noutput_files = separator.separate(['audio1.wav', 'audio2.wav', 'audio3.wav'])\n```\n\n您也可以指定包含音频文件的文件夹路径，而不是逐一列出每个文件的完整路径：\n```python\nfrom audio_separator.separator import Separator\n\n# 初始化 Separator 类（可选配置属性如下）\nseparator = Separator()\n\n# 加载一个模型\nseparator.load_model(model_filename='model_bs_roformer_ep_317_sdr_12.9755.ckpt')\n\n# 分离位于文件夹中的所有音频文件\noutput_files = separator.separate('path\u002Fto\u002Faudio_directory')\n```\n\n#### 重命名音轨\n\n您可以通过指定所需名称来重命名输出文件。例如：\n```python\noutput_names = {\n    \"Vocals\": \"vocals_output\",\n    \"Instrumental\": \"instrumental_output\",\n}\noutput_files = separator.separate('audio1.wav', output_names)\n```\n在这种情况下，输出文件名将是：`vocals_output.wav` 和 `instrumental_output.wav`。\n\n您还可以单独重命名特定的音轨：\n\n- 要重命名人声音轨：\n  ```python\n  output_names = {\n      \"Vocals\": \"vocals_output\",\n  }\n  output_files = separator.separate('audio1.wav', output_names)\n  ```\n  > 输出文件将被命名为：`vocals_output.wav` 和 `audio1_(Instrumental)_model_mel_band_roformer_ep_3005_sdr_11.wav`\n- 要重命名伴奏音轨：\n  ```python\n  output_names = {\n      \"Instrumental\": \"instrumental_output\",\n  }\n  output_files = separator.separate('audio1.wav', output_names)\n  ```\n  > 输出文件将被命名为：`audio1_(Vocals)_model_mel_band_roformer_ep_3005_sdr_11.wav` 和 `instrumental_output.wav`\n- Demucs 模型的音轨列表：\n  - htdemucs_6s.yaml\n    ```python\n    output_names = {\n        \"Vocals\": \"vocals_output\",\n        \"Drums\": \"drums_output\",\n        \"Bass\": \"bass_output\",\n        \"Other\": \"other_output\",\n        \"Guitar\": \"guitar_output\",\n        \"Piano\": \"piano_output\",\n    }\n    ```\n  - 其他 Demucs 模型\n    ```python\n    output_names = {\n        \"Vocals\": \"vocals_output\",\n        \"Drums\": \"drums_output\",\n        \"Bass\": \"bass_output\",\n        \"Other\": \"other_output\",\n   
 }\n    ```\n\n## Separator 类的参数\n\n- **`log_level`:**（可选）日志级别，例如 INFO、DEBUG、WARNING。`默认值：logging.INFO`\n- **`log_formatter`:**（可选）日志格式。默认值：None，回退到 '%(asctime)s - %(levelname)s - %(module)s - %(message)s'\n- **`model_file_dir`:**（可选）用于缓存模型文件的目录。`默认值：\u002Ftmp\u002Faudio-separator-models\u002F`\n- **`output_dir`:**（可选）分离后的文件将保存的目录。如果未指定，则使用当前目录。\n- **`output_format`:**（可选）用于编码输出文件的格式，任何常见格式（WAV、MP3、FLAC、M4A 等）。`默认值：WAV`\n- **`normalization_threshold`:**（可选）输出音频的振幅将被乘以的倍数。`默认值：0.9`\n- **`amplification_threshold`:**（可选）波形将被放大的最小振幅水平。如果音频的峰值振幅低于此阈值，波形将被缩放以达到该水平。`默认值：0.0`\n- **`output_single_stem`:**（可选）仅输出单个音轨，例如“Instrumental”或“Vocals”。`默认值：None`\n- **`invert_using_spec`:**（可选）使用频谱图进行反转的标志。`默认值：False`\n- **`sample_rate`:**（可选）设置输出音频的采样率。`默认值：44100`\n- **`use_soundfile`:**（可选）使用 soundfile 进行输出写入，可以解决 OOM 问题，尤其是在较长的音频上。`默认值：False`\n- **`use_autocast`:**（可选）使用 PyTorch autocast 进行更快推理的标志。请勿在 CPU 推理时使用。`默认值：False`\n- **`mdx_params`:**（可选）MDX 架构特定属性及默认值。`默认值：{\"hop_length\": 1024, \"segment_size\": 256, \"overlap\": 0.25, \"batch_size\": 1, \"enable_denoise\": False}`\n- **`vr_params`:**（可选）VR 架构特定属性及默认值。`默认值：{\"batch_size\": 1, \"window_size\": 512, \"aggression\": 5, \"enable_tta\": False, \"enable_post_process\": False, \"post_process_threshold\": 0.2, \"high_end_process\": False}`\n- **`demucs_params`:**（可选）Demucs 架构特定属性及默认值。`默认值：{\"segment_size\": \"Default\", \"shifts\": 2, \"overlap\": 0.25, \"segments_enabled\": True}` _(注：`segment_size` 的“Default”使用模型内部默认值，通常旧版 Demucs 模型为 40，而 Demucs v4\u002Fhtdemucs 为 10)_\n- **`mdxc_params`:**（可选）MDXC 架构特定属性及默认值。`默认值：{\"segment_size\": 256, \"override_model_segment_size\": False, \"batch_size\": 1, \"overlap\": 8, \"pitch_shift\": 0}`\n- **`ensemble_algorithm`:**（可选）用于集成多个模型的算法。`默认值：'avg_wave'`\n- **`ensemble_weights`:**（可选）集成中每个模型的权重。`默认值：None`（等权重）\n- **`ensemble_preset`:**（可选）命名的集成预设（例如 `'vocal_balanced'`、`'karaoke'`）。会自动设置模型、算法和权重。使用 `Separator(info_only=True).list_ensemble_presets()` 查看所有预设。`默认值：None`\n
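\n结合上面的参数说明，下面给出一个在初始化时集中配置常用参数的示意草图（各参数含义见上表，具体取值与模型文件名仅为演示假设）：\n\n```python\nimport logging\n\nfrom audio_separator.separator import Separator\n\n# 集中配置：调试日志、输出目录与格式、归一化阈值、仅输出人声，以及 MDX 架构参数\nseparator = Separator(\n    log_level=logging.DEBUG,\n    output_dir='separated',\n    output_format='MP3',\n    normalization_threshold=0.7,\n    output_single_stem='Vocals',\n    mdx_params={\"hop_length\": 1024, \"segment_size\": 256, \"overlap\": 0.25, \"batch_size\": 4, \"enable_denoise\": True},\n)\n\nseparator.load_model(model_filename='UVR-MDX-NET-Inst_HQ_3.onnx')\noutput_files = separator.separate('audio1.wav')\n```\n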
\n## 远程 API 使用 🌐\n\nAudio Separator 包含一个远程 API 客户端，允许您连接到已部署的 Audio Separator API 服务，从而无需在本地运行模型即可执行音频分离。该 API 使用异步处理和作业轮询，以高效处理分离任务。\n\n要在 modal.com 上将 Audio Separator 部署为 API 并用于远程处理，请参阅此处的详细文档：[audio_separator\u002Fremote\u002FREADME.md](audio_separator\u002Fremote\u002FREADME.md)。\n\n## 要求 📋\n\nPython >= 3.10\n\n库：torch、onnx、onnxruntime、numpy、librosa、requests、six、tqdm、pydub\n\n## 本地开发\n\n该项目使用 Poetry 进行依赖管理和打包。请按照以下步骤设置本地开发环境：\n\n### 先决条件\n\n- 确保您的机器上已安装 Python 3.10 或更高版本。\n- 安装 Conda（推荐 Miniforge：[Miniforge GitHub](https:\u002F\u002Fgithub.com\u002Fconda-forge\u002Fminiforge)）来管理您的 Python 虚拟环境。\n\n### 克隆仓库\n\n将仓库克隆到您的本地机器：\n\n```sh\ngit clone https:\u002F\u002Fgithub.com\u002FYOUR_USERNAME\u002Faudio-separator.git\ncd audio-separator\n```\n\n如果您已 fork 了该仓库，请将 `YOUR_USERNAME` 替换为您自己的 GitHub 用户名；如果您拥有权限，则使用主仓库 URL。\n\n### 创建并激活 Conda 环境\n\n使用以下命令创建并激活 Conda 环境：\n\n```sh\nconda env create\nconda activate audio-separator-dev\n```\n\n### 安装依赖\n\n进入 Conda 环境后，运行以下命令安装项目依赖项：\n\n```sh\npoetry install\n```\n\n再根据您运行时使用 GPU、CPU 还是 DirectML（dml），安装对应的额外依赖项：\n```sh\npoetry install --extras \"cpu\"\n```\n或\n```sh\npoetry install --extras \"gpu\"\n```\n或\n```sh\npoetry install --extras \"dml\"\n```\n\n### 在本地运行命令行界面\n\n您可以在虚拟环境中直接运行 CLI 命令。例如：\n\n```sh\naudio-separator path\u002Fto\u002Fyour\u002Faudio-file.wav\n```\n\n### 停用虚拟环境\n\n完成开发工作后，只需输入以下命令即可退出虚拟环境：\n\n```sh\nconda deactivate\n```\n\n### 构建软件包\n\n要构建用于分发的软件包，请使用以下命令：\n\n```sh\npoetry build\n```\n\n这将在 `dist` 目录中生成分发包——但目前只有 @beveradb 能够将其发布到 PyPI。\n\n## 贡献 🤝\n\n非常欢迎各位贡献！请先 fork 本仓库，然后提交包含您更改的 pull request，我会尽快审查、合并并发布！\n\n- 本项目是 100% 开源的，任何人都可以自由使用和修改。\n- 如果维护本仓库的工作量对我来说变得过大，我可能会寻求志愿者共同承担维护工作，不过这种情况不太可能发生。\n- MDX-Net 分离模型的开发与支持属于主 [UVR 项目](https:\u002F\u002Fgithub.com\u002FAnjok07\u002Fultimatevocalremovergui)，本仓库只是为这些模型提供一个 CLI\u002FPython 封装，以便更方便地以编程方式运行它们。因此，如果您希望尝试改进这些模型本身，请参与 UVR 项目，并在其中寻找相关指导！\n\n## 许可证 📄\n\n本项目采用 MIT [许可证](LICENSE)。\n\n- **请注意：** 如果您选择将本项目集成到其他项目中，并且使用默认模型或作为 [UVR](https:\u002F\u002Fgithub.com\u002FAnjok07\u002Fultimatevocalremovergui) 项目一部分训练的任何其他模型，请务必遵守 MIT 许可证的要求，向 UVR 及其开发者致谢！\n\n## 致谢 🙏\n\n- [Anjok07](https:\u002F\u002Fgithub.com\u002FAnjok07) - [Ultimate Vocal Remover GUI](https:\u002F\u002Fgithub.com\u002FAnjok07\u002Fultimatevocalremovergui) 的作者，本仓库中的几乎所有代码都源自该 GUI！本项目的任何成就都应归功于他。感谢！\n- [DilanBoskan](https:\u002F\u002Fgithub.com\u002FDilanBoskan) - 您在本项目初期所做的贡献对 UVR 的成功至关重要。感谢！\n- [Kuielab & Woosung Choi](https:\u002F\u002Fgithub.com\u002Fkuielab) - 开发了最初的 MDX-Net AI 代码。\n- [KimberleyJSN](https:\u002F\u002Fgithub.com\u002FKimberleyJensen) - 为 MDX-Net 和 Demucs 的训练脚本实现提供了建议和支持。感谢！\n- [Hv](https:\u002F\u002Fgithub.com\u002FNaJeongMo\u002FColab-for-MDX_B) - 帮助将分块处理功能集成到 MDX-Net AI 代码中。感谢！\n- [zhzhongshi](https:\u002F\u002Fgithub.com\u002Fzhzhongshi) - 帮助在 `audio-separator` 中添加对 MDXC 模型的支持。感谢！\n\n## 联系方式 💌\n\n如有任何问题或反馈，请提交 issue，或直接联系 @beveradb（Andrew Beveridge，邮箱：andrew@beveridge.uk）。\n\n---\n\u003Cdiv align=\"center\">\n\n\u003C!-- sponsors -->\u003C!-- sponsors -->\n\n## 感谢所有贡献者的辛勤付出\n\n\u003Ca href=\"https:\u002F\u002Fgithub.com\u002Fnomadkaraoke\u002Fpython-audio-separator\u002Fgraphs\u002Fcontributors\">\n  \u003Cimg src=\"https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fnomadkaraoke_python-audio-separator_readme_a77de88dc878.png\" \u002F>\n\u003C\u002Fa>\n\n\u003C\u002Fdiv>","# python-audio-separator 快速上手指南\n\n**Audio Separator** 是一个强大的 Python 工具，用于将音频文件分离为不同的人声和乐器音轨（Stems）。它基于 UVR (Ultimate Vocal Remover) 社区训练的 MDX-Net、VR Arch、Demucs 等模型，非常适合制作卡拉 OK、音乐重混或音频降噪。\n\n## 环境准备\n\n### 系统要求\n- **操作系统**: Linux, macOS (支持 Apple Silicon M1\u002FM2\u002FM3), Windows\n- **Python 版本**: 3.10 或更高（与 README 的要求一致）\n- **硬件加速** (可选但推荐):\n  - **NVIDIA GPU**: 需要支持 CUDA (版本 11.8 或 12.2)\n  - **Apple Silicon**: macOS Sonoma 及以上版本 (支持 CoreML 加速)\n  - **CPU**: 所有平台均支持，但速度较慢\n\n### 前置依赖\n无论采用何种安装方式，都需要确保系统中已安装 **FFmpeg**。\n\n- **Debian\u002FUbuntu**:\n  ```bash\n  apt-get update && apt-get install -y ffmpeg\n  ```\n- **macOS**:\n  ```bash\n  brew update && brew install ffmpeg\n  ```\n- **Windows**:\n  请从 [ffmpeg.org](https:\u002F\u002Fffmpeg.org\u002Fdownload.html) 下载并配置环境变量，或使用 `choco install ffmpeg`。\n\n---\n\n## 安装步骤\n\n根据你的硬件环境选择以下一种安装方式：\n\n### 1. 使用 Docker (最推荐，环境隔离最好)\n无需配置本地依赖，直接拉取镜像运行。\n\n- **CPU 版本**:\n  ```bash\n  docker run -it -v $(pwd):\u002Fworkdir beveradb\u002Faudio-separator input.wav\n  ```\n- **GPU 版本 (NVIDIA)**:\n  ```bash\n  docker run -it --gpus all -v $(pwd):\u002Fworkdir beveradb\u002Faudio-separator:gpu input.wav\n  ```\n*(注：请将 `input.wav` 替换为你实际的文件名)*\n\n### 2. 使用 Pip 安装 (Python 项目集成)\n\n#### 方案 A: NVIDIA GPU (CUDA 加速)\n```bash\npip install \"audio-separator[gpu]\"\n```\n*如果遇到 CUDA 库冲突，可能需要手动重装 torch 和 onnxruntime-gpu。*\n\n#### 方案 B: Apple Silicon (CoreML 加速) 或纯 CPU\n```bash\npip install \"audio-separator[cpu]\"\n```\n\n### 3. 使用 Conda 安装\n```bash\n# GPU 版本\nconda install pytorch=*=*cuda* onnxruntime=*=*cuda* audio-separator -c pytorch -c conda-forge\n\n# CPU 版本\nconda install audio-separator -c pytorch -c conda-forge\n```\n\n> **国内加速提示**: 如果 pip 或 conda 下载缓慢，建议配置国内镜像源。\n> - Pip: `pip install ... -i https:\u002F\u002Fpypi.tuna.tsinghua.edu.cn\u002Fsimple`\n> - Conda: `conda config --add channels https:\u002F\u002Fmirrors.tuna.tsinghua.edu.cn\u002Fanaconda\u002Fpkgs\u002Fmain\u002F`
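\n\n> **安装自检**: 任一方式安装完成后，建议先运行上文 README 中介绍过的环境检查命令，确认 FFmpeg 与硬件加速（CUDA\u002FCoreML）均被正确识别：\n> ```bash\n> audio-separator --env_info\n> ```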
\n\n---\n\n## 基本使用\n\n安装完成后，即可通过命令行直接使用。工具会自动下载所需的模型文件，无需手动干预。\n\n### 1. 查看可用模型\n在分离音频前，可以先查看支持的模型列表：\n```bash\naudio-separator --list_models\n```\n你可以使用 `--list_filter=drums` 等参数过滤特定类型的模型（如鼓点、人声等）。\n\n### 2. 执行音频分离 (最简单示例)\n将音频文件分离为人声 (Vocals) 和伴奏 (Instrumental)：\n\n```bash\naudio-separator \u002Fpath\u002Fto\u002Fyour\u002Finput.mp3\n```\n\n**说明**:\n- 未指定模型时，会使用内置的默认模型将音频分为人声和伴奏两个音轨。\n- 输出文件将保存在当前目录下。\n- 支持格式：MP3, WAV, FLAC, M4A 等常见格式。\n\n### 3. 指定特定模型\n如果你需要更精细的控制（例如分离鼓点或去除回声），可以指定模型文件名：\n\n```bash\naudio-separator input.wav --model_filename UVR_MDXNET_KARA_2.onnx\n```\n\n### 4. 在 Python 代码中使用\n也可以在 Python 脚本中作为库调用：\n\n```python\nfrom audio_separator.separator import Separator\n\nseparator = Separator()\nseparator.load_model()  # 未指定文件名时加载默认模型\noutput_files = separator.separate('input.mp3')\n\nprint(f\"分离完成，生成文件：{output_files}\")\n```\n\n---\n\n**提示**: 首次运行时，程序会自动从网络下载模型文件到本地缓存目录（默认为 `\u002Ftmp\u002Faudio-separator-models\u002F`），请确保网络连接畅通。","一家小型音乐工作室需要为独立歌手快速制作伴奏带和分轨素材，以用于短视频推广和现场排练。\n\n### 没有 python-audio-separator 时\n- 工程师必须手动配置复杂的深度学习环境，反复调试 UVR 图形界面依赖，耗时数小时才能运行一次分离任务。\n- 处理批量歌曲时缺乏自动化脚本支持，只能人工逐个导入文件并点击按钮，效率极低且容易出错。\n- 想要提取特定乐器（如仅保留鼓点或贝斯）时，受限于固定功能，难以灵活调用不同的预训练模型进行精细处理。\n- 在服务器端部署时，由于缺少轻量级的 CLI 或 Python 库支持，难以将音频分离功能集成到现有的自动化工作流中。\n\n### 使用 python-audio-separator 后\n- 通过简单的 pip 安装或直接调用命令行，几分钟内即可利用 MDX-Net 等先进模型完成人声与伴奏的高质量分离。\n- 编写几行 Python 代码即可实现文件夹级别的批量处理，自动遍历上百首歌曲并输出分轨文件，释放了大量人力。\n- 灵活指定不同模型参数，轻松从混合音源中单独提取鼓、贝斯、钢琴等特定乐器音轨（stem），满足多样化的创作需求。\n- 无缝集成到后端服务或 Docker 容器中，支持 GPU 加速，让音频处理成为自动化内容生产流水线中的标准一环。\n\npython-audio-separator 将原本繁琐的音频分离过程转化为简单可靠的代码指令，极大降低了音乐技术应用的门槛并提升了生产效率。","https:\u002F\u002Foss.gittoolsai.com\u002Fimages\u002Fnomadkaraoke_python-audio-separator_3cbf080b.png","nomadkaraoke","Nomad Karaoke","https:\u002F\u002Foss.gittoolsai.com\u002Favatars\u002Fnomadkaraoke_7c6e1cec.png","Building a better Karaoke experience for everyone",null,"andrew@nomadkaraoke.com","https:\u002F\u002Fnomadkaraoke.com","https:\u002F\u002Fgithub.com\u002Fnomadkaraoke",[85,89],{"name":86,"color":87,"percentage":88},"Python","#3572A5",97.3,{"name":90,"color":91,"percentage":92},"Shell","#89e051",2.7,1133,182,"2026-04-17T03:42:26","MIT",3,"Linux, macOS, Windows (通过 Docker 或 Pip)","可选。若需加速，推荐 NVIDIA GPU (支持 CUDA 11.8 或 12.2)；Apple Silicon (M1 及更新版本) 支持 CoreML 加速。未说明具体显存大小要求。","未说明",{"notes":102,"python":103,"dependencies":104},"1. 支持多种安装方式：Docker (推荐，含 CPU\u002FGPU 镜像)、Conda、Pip。\n2. GPU 加速需注意 CUDA 版本匹配：NVIDIA 需 CUDA 11.8 或 12.2；在 Google Colab 等默认 CUDA 12 环境中，可能需要额外安装 CUDA 11 库或使用 nightly 版 onnxruntime-gpu。\n3. Apple Silicon 用户需 macOS Sonoma 及以上版本以启用 CoreML 加速。\n4. FFmpeg 是必需依赖，若使用 Pip 在非 Conda\u002FDocker 环境安装，需手动单独安装 FFmpeg。\n5. 模型文件会在首次运行时自动下载，无需手动准备。","Python >= 3.10（README 的要求部分明确说明）",[105,106,107],"torch","onnxruntime","ffmpeg",[47,14],"2026-03-27T02:49:30.150509","2026-04-18T09:19:22.292552",[112,117,122,127,132,137],{"id":113,"question_zh":114,"answer_zh":115,"source_url":116},38853,"如何提高音频分离的处理速度？","可以通过以下几种方式提升速度：\n1. 尝试使用 VR 架构模型（.pth 文件），例如 `2_HP-UVR.pth`，它们在提供同等质量的同时，计算时间约为 MDX 模型的一半。\n2. 调整 MDX 架构参数以控制推理速度：\n   - `--mdx_segment_size`: 默认 256，较大值消耗更多资源但可能结果更好。\n   - `--mdx_overlap`: 默认 0.25，越高越好但越慢（范围 0.001-0.999）。\n   - `--mdx_batch_size`: 默认 1，增大可消耗更多 RAM 但处理稍快（例如设为 4）。\n3. 
将输入音频分割成较短片段，分别处理后重新拼接，或多线程\u002F多容器并行运行 `audio-separator` 进程。","https:\u002F\u002Fgithub.com\u002Fnomadkaraoke\u002Fpython-audio-separator\u002Fissues\u002F32",{"id":118,"question_zh":119,"answer_zh":120,"source_url":121},38854,"为什么 Roformer 模型在 python-audio-separator 中无法使用 GPU，而在 UVR5 中可以？","这是一个已知问题，特定版本的 Roformer 模型可能未正确调用 GPU。维护者指出，使用最新版本的 `audio-separator` 以及特定的 Dockerfile 修改版可以解决此问题。确保您使用的是支持最新 Roformer 模型的版本（如 0.17.1 或更高），并检查 PyTorch 是否正确识别到 CUDA 设备。如果问题依旧，建议参考社区提供的修改版 Docker 配置或在 Mac 上尝试相关的 PR 修复方案。","https:\u002F\u002Fgithub.com\u002Fnomadkaraoke\u002Fpython-audio-separator\u002Fissues\u002F73",{"id":123,"question_zh":124,"answer_zh":125,"source_url":126},38855,"为什么使用相同的模型和配置，python-audio-separator 的分离效果比 UVR5 差（声音发闷或不清晰）？","这是因为 VR 架构模型的默认“攻击性”（aggression）参数设置不同。维护者已在 `audio-separator` 版本 0.17.1 中将 VR 架构的默认 aggression 参数修复为与 UVR5 一致的值。请升级到此版本或更高版本以获得更好的音质。此外，推荐尝试 RoFormer 模型（如 `model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt`），其表现通常优于其他架构。","https:\u002F\u002Fgithub.com\u002Fnomadkaraoke\u002Fpython-audio-separator\u002Fissues\u002F59",{"id":128,"question_zh":129,"answer_zh":130,"source_url":131},38856,"加载 .pth 或 .ckpt 模型时报错（如 KeyError 或 Invalid buffer size）怎么办？","这类错误通常与环境配置或模型元数据解析有关。\n1. 对于 `KeyError: 'primary_stem'`：这通常发生在某些 MDX23C .ckpt 模型上，可能是模型文件格式解析问题，建议更新到最新版本（0.18.3+），该版本修复了对 Demucs 和部分 CKPT 模型的支持。\n2. 对于 `RuntimeError: Invalid buffer size`：这在 macOS (Intel\u002FApple Silicon) 上较常见，可能与内存分配有关。尝试设置环境变量 `PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0`（针对 Apple Silicon）或检查系统可用内存。如果 `.onnx` 模型能正常工作，优先使用 ONNX 格式模型作为替代方案。","https:\u002F\u002Fgithub.com\u002Fnomadkaraoke\u002Fpython-audio-separator\u002Fissues\u002F90",{"id":133,"question_zh":134,"answer_zh":135,"source_url":136},38857,"如何添加或使用本地自定义模型（如 BS-ROFO-SW）？","从 `audio-separator` 版本 0.39.1 开始，已修复了对多茎（multi-stem）MDXC 和 Roformer 模型的支持，包括类似 BS-ROFO-SW 的模型。您可以直接下载模型文件到本地，并在代码中通过 `load_model(model_filename='您的模型文件名')` 加载。确保您的版本号在 0.39.1 或以上，以兼容这些新模型架构。","https:\u002F\u002Fgithub.com\u002Fnomadkaraoke\u002Fpython-audio-separator\u002Fissues\u002F230",{"id":138,"question_zh":139,"answer_zh":140,"source_url":116},38858,"有哪些推荐的模型可以在速度和质量之间取得良好平衡？","维护者推荐以下模型：\n1. **速度与质量平衡**：`2_HP-UVR.pth`（VR 架构），适用于简单的人声\u002F伴奏分离，速度快且效果好。\n2. **最佳质量**：RoFormer 系列模型，特别是 `model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt`，目前在所有架构中表现最佳。\n3. 
**其他选择**：部分 MDX 模型（如 `UVR-MDX-NET-Inst_HQ_4.onnx`）在特定场景下也表现不错，但计算成本较高。建议根据硬件资源选择合适的模型架构。",[142,146,150,154,158,162,166,170,174,178,182,186,190,194,198,202,206,210,214,218],{"id":143,"version":144,"summary_zh":80,"released_at":145},314773,"v0.44.1","2026-03-25T21:45:55",{"id":147,"version":148,"summary_zh":80,"released_at":149},314774,"v0.44.0","2026-03-25T17:53:21",{"id":151,"version":152,"summary_zh":80,"released_at":153},314775,"v0.43.1","2026-03-23T16:16:38",{"id":155,"version":156,"summary_zh":80,"released_at":157},314776,"v0.43.0","2026-03-23T02:47:10",{"id":159,"version":160,"summary_zh":80,"released_at":161},314777,"v0.42.1","2026-03-17T02:36:51",{"id":163,"version":164,"summary_zh":80,"released_at":165},314778,"v0.42.0","2026-03-16T23:15:41",{"id":167,"version":168,"summary_zh":80,"released_at":169},314779,"v0.41.1","2026-01-24T07:45:04",{"id":171,"version":172,"summary_zh":80,"released_at":173},314780,"v0.41.0","2026-01-16T02:55:54",{"id":175,"version":176,"summary_zh":80,"released_at":177},314781,"v0.40.0","2025-11-30T00:08:00",{"id":179,"version":180,"summary_zh":80,"released_at":181},314782,"v0.39.1","2025-10-15T01:09:44",{"id":183,"version":184,"summary_zh":80,"released_at":185},314783,"v0.39.0","2025-09-28T22:58:40",{"id":187,"version":188,"summary_zh":80,"released_at":189},314784,"v0.38.1","2025-09-28T06:07:10",{"id":191,"version":192,"summary_zh":80,"released_at":193},314785,"v0.38.0","2025-09-28T01:11:46",{"id":195,"version":196,"summary_zh":80,"released_at":197},314786,"v0.37.1","2025-09-24T19:27:11",{"id":199,"version":200,"summary_zh":80,"released_at":201},314787,"v0.37.0","2025-09-24T14:35:37",{"id":203,"version":204,"summary_zh":80,"released_at":205},314788,"v0.36.1","2025-08-15T19:48:23",{"id":207,"version":208,"summary_zh":80,"released_at":209},314789,"v0.36.0","2025-08-15T14:57:17",{"id":211,"version":212,"summary_zh":80,"released_at":213},314790,"v0.35.2","2025-07-26T04:23:23",{"id":215,"version":216,"summary_zh":80,"released_at":217},314791,"v0.35.1","2025-07-26T03:55:45",{"id":219,"version":220,"summary_zh":80,"released_at":221},314792,"v0.35.0","2025-07-14T17:39:05"]