154 lines
4.1 KiB
TypeScript
154 lines
4.1 KiB
TypeScript
import crypto from 'crypto';
|
|
import { useToast } from '@/hooks/useToast';
|
|
import { encoding_for_model, type Tiktoken } from '@dqbd/tiktoken';
|
|
import Graphemer from 'graphemer';
|
|
import type { ChatModelType } from '@/constants/model';
|
|
|
|
const textDecoder = new TextDecoder();
|
|
const graphemer = new Graphemer();
|
|
let encMap: Record<string, Tiktoken>;
|
|
export const getEncMap = () => {
|
|
if (encMap) return encMap;
|
|
encMap = {
|
|
'gpt-3.5-turbo': encoding_for_model('gpt-3.5-turbo', {
|
|
'<|im_start|>': 100264,
|
|
'<|im_end|>': 100265,
|
|
'<|im_sep|>': 100266
|
|
}),
|
|
'gpt-4': encoding_for_model('gpt-4', {
|
|
'<|im_start|>': 100264,
|
|
'<|im_end|>': 100265,
|
|
'<|im_sep|>': 100266
|
|
}),
|
|
'gpt-4-32k': encoding_for_model('gpt-4-32k', {
|
|
'<|im_start|>': 100264,
|
|
'<|im_end|>': 100265,
|
|
'<|im_sep|>': 100266
|
|
})
|
|
};
|
|
return encMap;
|
|
};
|
|
|
|
/**
|
|
* copy text data
|
|
*/
|
|
export const useCopyData = () => {
|
|
const { toast } = useToast();
|
|
|
|
return {
|
|
copyData: async (data: string, title: string = '复制成功') => {
|
|
try {
|
|
if (navigator.clipboard) {
|
|
await navigator.clipboard.writeText(data);
|
|
} else {
|
|
throw new Error('');
|
|
}
|
|
} catch (error) {
|
|
const textarea = document.createElement('textarea');
|
|
textarea.value = data;
|
|
document.body.appendChild(textarea);
|
|
textarea.select();
|
|
document.execCommand('copy');
|
|
document.body.removeChild(textarea);
|
|
}
|
|
|
|
toast({
|
|
title,
|
|
status: 'success',
|
|
duration: 1000
|
|
});
|
|
}
|
|
};
|
|
};
|
|
|
|
/**
|
|
* 密码加密
|
|
*/
|
|
export const createHashPassword = (text: string) => {
|
|
const hash = crypto.createHash('sha256').update(text).digest('hex');
|
|
return hash;
|
|
};
|
|
|
|
/**
|
|
* 对象转成 query 字符串
|
|
*/
|
|
export const Obj2Query = (obj: Record<string, string | number>) => {
|
|
const queryParams = new URLSearchParams();
|
|
for (const key in obj) {
|
|
queryParams.append(key, `${obj[key]}`);
|
|
}
|
|
return queryParams.toString();
|
|
};
|
|
|
|
/* 格式化 chat 聊天内容 */
|
|
function getChatGPTEncodingText(
|
|
messages: { role: 'system' | 'user' | 'assistant'; content: string; name?: string }[],
|
|
model: 'gpt-3.5-turbo' | 'gpt-4' | 'gpt-4-32k'
|
|
) {
|
|
const isGpt3 = model === 'gpt-3.5-turbo';
|
|
|
|
const msgSep = isGpt3 ? '\n' : '';
|
|
const roleSep = isGpt3 ? '\n' : '<|im_sep|>';
|
|
|
|
return [
|
|
messages
|
|
.map(({ name = '', role, content }) => {
|
|
return `<|im_start|>${name || role}${roleSep}${content}<|im_end|>`;
|
|
})
|
|
.join(msgSep),
|
|
`<|im_start|>assistant${roleSep}`
|
|
].join(msgSep);
|
|
}
|
|
function text2TokensLen(encoder: Tiktoken, inputText: string) {
|
|
const encoding = encoder.encode(inputText, 'all');
|
|
const segments: { text: string; tokens: { id: number; idx: number }[] }[] = [];
|
|
|
|
let byteAcc: number[] = [];
|
|
let tokenAcc: { id: number; idx: number }[] = [];
|
|
let inputGraphemes = graphemer.splitGraphemes(inputText);
|
|
|
|
for (let idx = 0; idx < encoding.length; idx++) {
|
|
const token = encoding[idx]!;
|
|
byteAcc.push(...encoder.decode_single_token_bytes(token));
|
|
tokenAcc.push({ id: token, idx });
|
|
|
|
const segmentText = textDecoder.decode(new Uint8Array(byteAcc));
|
|
const graphemes = graphemer.splitGraphemes(segmentText);
|
|
|
|
if (graphemes.every((item, idx) => inputGraphemes[idx] === item)) {
|
|
segments.push({ text: segmentText, tokens: tokenAcc });
|
|
|
|
byteAcc = [];
|
|
tokenAcc = [];
|
|
inputGraphemes = inputGraphemes.slice(graphemes.length);
|
|
}
|
|
}
|
|
|
|
return segments.reduce((memo, i) => memo + i.tokens.length, 0) ?? 0;
|
|
}
|
|
export const countChatTokens = ({
|
|
model = 'gpt-3.5-turbo',
|
|
messages
|
|
}: {
|
|
model?: ChatModelType;
|
|
messages: { role: 'system' | 'user' | 'assistant'; content: string }[];
|
|
}) => {
|
|
const text = getChatGPTEncodingText(messages, model);
|
|
return text2TokensLen(getEncMap()[model], text);
|
|
};
|
|
|
|
export const sliceTextByToken = ({
|
|
model = 'gpt-3.5-turbo',
|
|
text,
|
|
length
|
|
}: {
|
|
model?: ChatModelType;
|
|
text: string;
|
|
length: number;
|
|
}) => {
|
|
const enc = getEncMap()[model];
|
|
const encodeText = enc.encode(text);
|
|
const decoder = new TextDecoder();
|
|
return decoder.decode(enc.decode(encodeText.slice(0, length)));
|
|
};
|