一、环境准备
1.克隆gitub repo
cd ~
git clone https://github.com/InternLM/Tutorial.git -b camp4
2.创建微调文件夹
mkdir -p /root/finetune && cd /root/finetune
3.创建conda环境并激活
conda create -n xtuner-env python=3.10 -y
conda activate xtuner-env
4.安装Xtuner
cd /root/Tutorial/docs/L1/XTuner
pip install -r requirements.txt
pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121
5.使用xtuner list-cfg可以验证安装
二、模型训练
1.数据准备
1.1 创建文件夹并且复制微调数据
mkdir -p /root/finetune/data && cd /root/finetune/data
cp -r /root/Tutorial/data/assistant_Tuner.jsonl /root/finetune/data
1.2 创建修改脚本
touch /root/finetune/data/change_script.py
import json
import argparse
from tqdm import tqdmdef process_line(line, old_text, new_text):# 解析 JSON 行data = json.loads(line)# 递归函数来处理嵌套的字典和列表def replace_text(obj):if isinstance(obj, dict):return {k: replace_text(v) for k, v in obj.items()}elif isinstance(obj, list):return [replace_text(item) for item in obj]elif isinstance(obj, str):return obj.replace(old_text, new_text)else:return obj# 处理整个 JSON 对象processed_data = replace_text(data)# 将处理后的对象转回 JSON 字符串return json.dumps(processed_data, ensure_ascii=False)def main(input_file, output_file, old_text, new_text):with open(input_file, 'r', encoding='utf-8') as infile, \open(output_file, 'w', encoding='utf-8') as outfile:# 计算总行数用于进度条total_lines = sum(1 for _ in infile)infile.seek(0) # 重置文件指针到开头# 使用 tqdm 创建进度条for line in tqdm(infile, total=total_lines, desc="Processing"):processed_line = process_line(line.strip(), old_text, new_text)outfile.write(processed_line + '\n')if __name__ == "__main__":parser = argparse.ArgumentParser(description="Replace text in a JSONL file.")parser.add_argument("input_file", help="Input JSONL file to process")parser.add_argument("output_file", help="Output file for processed JSONL")parser.add_argument("--old_text", default="尖米", help="Text to be replaced")parser.add_argument("--new_text", default="<你的名字>", help="Text to replace with")args = parser.parse_args()main(args.input_file, args.output_file, args.old_text, args.new_text)
将"<你的名字>"改成你的名字
1.3执行脚本
# usage:python change_script.py {input_file.jsonl} {output_file.jsonl}
cd ~/finetune/data
python change_script.py ./assistant_Tuner.jsonl ./assistant_Tuner_change.jsonl
2.训练开始
2.1 复制模型
mkdir /root/finetune/modelsln -s /root/share/new_models/Shanghai_AI_Laboratory/internlm2_5-7b-chat /root/finetune/models/internlm2_5-7b-chat
2.2准备配置文件
# cd {path/to/finetune}
cd /root/finetune
mkdir ./config
cd config
xtuner copy-cfg internlm2_5_chat_7b_qlora_alpaca_e3 ./
修改配置文件:
2.3启动微调
cd /root/finetune
conda activate xtuner-envxtuner train ./config/internlm2_5_chat_7b_qlora_alpaca_e3_copy.py --deepspeed deepspeed_zero2 --work-dir ./work_dirs/assistTuner
2.4权重转换
cd /root/finetune/work_dirs/assistTunerconda activate xtuner-env# 先获取最后保存的一个pth文件
pth_file=`ls -t /root/finetune/work_dirs/assistTuner/*.pth | head -n 1 | sed 's/:$//'`
export MKL_SERVICE_FORCE_INTEL=1
export MKL_THREADING_LAYER=GNU
xtuner convert pth_to_hf ./internlm2_5_chat_7b_qlora_alpaca_e3_copy.py ${pth_file} ./hf
2.5模型合并
cd /root/finetune/work_dirs/assistTuner
conda activate xtuner-envexport MKL_SERVICE_FORCE_INTEL=1
export MKL_THREADING_LAYER=GNU
xtuner convert merge /root/finetune/models/internlm2_5-7b-chat ./hf ./merged --max-shard-size 2GB
三、模型WebUI对话
1.安装streamlit
conda activate xtuner-envpip install streamlit==1.31.0
2.修改启动脚本
cd ~/Tutorial/tools/L1_XTuner_code
# 修改脚本xtuner_streamlit_demo.py第18行
model_name_or_path = "/root/finetune/work_dirs/assistTuner/merged"
streamlit run /root/Tutorial/tools/L1_XTuner_code/xtuner_streamlit_demo.py
3.端口映射到本地
ssh -CNg -L 8501:127.0.0.1:8501 root@ssh.intern-ai.org.cn -p *****
4.打开对话页面
在浏览器地址栏输入"http://127.0.0.1:8501"