feat: insert data de-weight;perf: input queue

This commit is contained in:
archer
2023-05-28 20:13:19 +08:00
parent 7e99f905bc
commit 516618b0cd
12 changed files with 187 additions and 105 deletions

View File

@@ -16,6 +16,10 @@ export type Props = {
prompt?: string;
};
export type Response = {
insertLen: number;
};
export default withNextCors(async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
try {
const { kbId, data, mode, prompt } = req.body as Props;
@@ -28,7 +32,7 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex
// 凭证校验
const { userId } = await authUser({ req });
jsonRes(res, {
jsonRes<Response>(res, {
data: await pushDataToKb({
kbId,
data,
@@ -51,16 +55,12 @@ export async function pushDataToKb({
data,
mode,
prompt
}: { userId: string } & Props) {
}: { userId: string } & Props): Promise<Response> {
await authKb({
userId,
kbId
});
if (data.length === 0) {
return {};
}
// 过滤重复的 qa 内容
const set = new Set();
const filterData: {
@@ -75,41 +75,54 @@ export async function pushDataToKb({
set.add(text);
}
});
// 数据库去重
// const searchRes = await Promise.allSettled(
// data.map(async ({ q, a = '' }) => {
// if (!q) {
// return Promise.reject('q为空');
// }
const insertData = (
await Promise.allSettled(
filterData.map(async ({ q, a = '' }) => {
if (mode !== TrainingModeEnum.index) {
return Promise.resolve({
q,
a
});
}
// q = q.replace(/\\n/g, '\n');
// a = a.replace(/\\n/g, '\n');
if (!q) {
return Promise.reject('q为空');
}
// // Exactly the same data, not push
// try {
// const count = await PgClient.count('modelData', {
// where: [['user_id', userId], 'AND', ['kb_id', kbId], 'AND', ['q', q], 'AND', ['a', a]]
// });
q = q.replace(/\\n/g, '\n').trim().replace(/'/g, '"');
a = a.replace(/\\n/g, '\n').trim().replace(/'/g, '"');
// if (count > 0) {
// return Promise.reject('已经存在');
// }
// } catch (error) {
// error;
// }
// return Promise.resolve({
// q,
// a
// });
// })
// );
// const filterData = searchRes
// .filter((item) => item.status === 'fulfilled')
// .map<{ q: string; a: string }>((item: any) => item.value);
// Exactly the same data, not push
try {
const { rows } = await PgClient.query(`
SELECT COUNT(*) > 0 AS exists
FROM modelData
WHERE md5(q)=md5('${q}') AND md5(a)=md5('${a}') AND user_id='${userId}' AND kb_id='${kbId}'
`);
const exists = rows[0]?.exists || false;
if (exists) {
return Promise.reject('已经存在');
}
} catch (error) {
console.log(error);
error;
}
return Promise.resolve({
q,
a
});
})
)
)
.filter((item) => item.status === 'fulfilled')
.map<{ q: string; a: string }>((item: any) => item.value);
// 插入记录
await TrainingData.insertMany(
data.map((item) => ({
insertData.map((item) => ({
q: item.q,
a: item.a,
userId,
@@ -119,9 +132,11 @@ export async function pushDataToKb({
}))
);
startQueue();
insertData.length > 0 && startQueue();
return {};
return {
insertLen: insertData.length
};
}
export const config = {

View File

@@ -32,10 +32,10 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex
await PgClient.update('modelData', {
where: [['id', dataId], 'AND', ['user_id', userId]],
values: [
{ key: 'a', value: a },
{ key: 'a', value: a.replace(/'/g, '"') },
...(q
? [
{ key: 'q', value: q },
{ key: 'q', value: q.replace(/'/g, '"') },
{ key: 'vector', value: `[${vector[0]}]` }
]
: [])

View File

@@ -54,7 +54,7 @@ const InputDataModal = ({
setLoading(true);
try {
const res = await postKbDataFromList({
const { insertLen } = await postKbDataFromList({
kbId,
data: [
{
@@ -65,14 +65,22 @@ const InputDataModal = ({
mode: TrainingModeEnum.index
});
toast({
title: res === 0 ? '可能已存在完全一致的数据' : '导入数据成功,需要一段时间训练',
status: 'success'
});
reset({
a: '',
q: ''
});
if (insertLen === 0) {
toast({
title: '已存在完全一致的数据',
status: 'warning'
});
} else {
toast({
title: '导入数据成功,需要一段时间训练',
status: 'success'
});
reset({
a: '',
q: ''
});
}
onSuccess();
} catch (err: any) {
toast({

View File

@@ -37,6 +37,7 @@ const SelectJsonModal = ({
const { toast } = useToast();
const { File, onOpen } = useSelectFile({ fileType: '.csv', multiple: false });
const [fileData, setFileData] = useState<{ q: string; a: string }[]>([]);
const [successData, setSuccessData] = useState(0);
const { openConfirm, ConfirmChild } = useConfirm({
content: '确认导入该数据集?'
});
@@ -67,27 +68,35 @@ const SelectJsonModal = ({
[setSelecting, toast]
);
const { mutate, isLoading } = useMutation({
const { mutate, isLoading: uploading } = useMutation({
mutationFn: async () => {
if (!fileData || fileData.length === 0) return;
const res = await postKbDataFromList({
kbId,
data: fileData,
mode: TrainingModeEnum.index
});
let success = 0;
// subsection import
const step = 50;
for (let i = 0; i < fileData.length; i += step) {
const { insertLen } = await postKbDataFromList({
kbId,
data: fileData.slice(i, i + step),
mode: TrainingModeEnum.index
});
success += insertLen || 0;
setSuccessData((state) => state + step);
}
toast({
title: `导入数据成功,最终导入: ${res || 0} 条数据。需要一段时间训练`,
title: `导入数据成功,最终导入: ${success} 条数据。需要一段时间训练`,
status: 'success',
duration: 4000
});
onClose();
onSuccess();
},
onError() {
onError(err) {
toast({
title: '导入文件失败',
title: getErrText(err, '导入文件失败'),
status: 'error'
});
}
@@ -121,15 +130,15 @@ const SelectJsonModal = ({
csv模板
</Box>
<Flex alignItems={'center'}>
<Button isLoading={selecting} onClick={onOpen}>
<Button isLoading={selecting} isDisabled={uploading} onClick={onOpen}>
csv
</Button>
<Box ml={4}> {fileData.length} </Box>
<Box ml={4}> {fileData.length} 100</Box>
</Flex>
</Box>
<Box flex={'3 0 0'} h={'100%'} overflow={'auto'} p={2} backgroundColor={'blackAlpha.50'}>
{fileData.map((item, index) => (
{fileData.slice(0, 100).map((item, index) => (
<Box key={index}>
<Box>
Q{index + 1}. {item.q}
@@ -144,15 +153,15 @@ const SelectJsonModal = ({
<Flex px={6} pt={2} pb={4}>
<Box flex={1}></Box>
<Button variant={'outline'} mr={3} onClick={onClose}>
<Button variant={'outline'} isLoading={uploading} mr={3} onClick={onClose}>
</Button>
<Button
isLoading={isLoading}
isDisabled={fileData.length === 0}
onClick={openConfirm(mutate)}
>
<Button isDisabled={fileData.length === 0 || uploading} onClick={openConfirm(mutate)}>
{uploading ? (
<Box>{Math.round((successData / fileData.length) * 100)}%</Box>
) : (
'确认导入'
)}
</Button>
</Flex>
</ModalContent>

View File

@@ -55,9 +55,14 @@ const SelectFileModal = ({
const { File, onOpen } = useSelectFile({ fileType: fileExtension, multiple: true });
const [mode, setMode] = useState<`${TrainingModeEnum}`>(TrainingModeEnum.index);
const [fileTextArr, setFileTextArr] = useState<string[]>(['']);
const [splitRes, setSplitRes] = useState<{ tokens: number; chunks: string[] }>({
const [splitRes, setSplitRes] = useState<{
tokens: number;
chunks: string[];
successChunks: number;
}>({
tokens: 0,
chunks: []
chunks: [],
successChunks: 0
});
const { openConfirm, ConfirmChild } = useConfirm({
content: `确认导入该文件,需要一定时间进行拆解,该任务无法终止!如果余额不足,未完成的任务会被直接清除。一共 ${
@@ -104,19 +109,30 @@ const SelectFileModal = ({
[toast]
);
const { mutate, isLoading } = useMutation({
const { mutate, isLoading: uploading } = useMutation({
mutationFn: async () => {
if (splitRes.chunks.length === 0) return;
await postKbDataFromList({
kbId,
data: splitRes.chunks.map((text) => ({ q: text, a: '' })),
prompt: `下面是"${prompt || '一段长文本'}"`,
mode
});
// subsection import
let success = 0;
const step = 50;
for (let i = 0; i < splitRes.chunks.length; i += step) {
const { insertLen } = await postKbDataFromList({
kbId,
data: splitRes.chunks.slice(i, i + step).map((text) => ({ q: text, a: '' })),
prompt: `下面是"${prompt || '一段长文本'}"`,
mode
});
success += insertLen;
setSplitRes((state) => ({
...state,
successChunks: state.successChunks + step
}));
}
toast({
title: '导入数据成功,需要一段拆解和训练. 重复数据会自动删除',
title: `去重后共导入 ${success} 条数据,需要一段拆解和训练.`,
status: 'success'
});
onClose();
@@ -148,7 +164,8 @@ const SelectFileModal = ({
setSplitRes({
tokens: splitRes.reduce((sum, item) => sum + item.tokens, 0),
chunks: splitRes.map((item) => item.chunks).flat()
chunks: splitRes.map((item) => item.chunks).flat(),
successChunks: 0
});
await promise;
@@ -235,6 +252,11 @@ const SelectFileModal = ({
...fileTextArr.slice(i + 1)
]);
}}
onBlur={(e) => {
if (fileTextArr.length > 1 && e.target.value === '') {
setFileTextArr((state) => [...state.slice(0, i), ...state.slice(i + 1)]);
}
}}
/>
</Box>
))}
@@ -242,19 +264,22 @@ const SelectFileModal = ({
</ModalBody>
<Flex px={6} pt={2} pb={4}>
<Button isLoading={btnLoading} onClick={onOpen}>
<Button isLoading={btnLoading} isDisabled={uploading} onClick={onOpen}>
</Button>
<Box flex={1}></Box>
<Button variant={'outline'} colorScheme={'gray'} mr={3} onClick={onClose}>
<Button variant={'outline'} isLoading={uploading} mr={3} onClick={onClose}>
</Button>
<Button
isLoading={isLoading || btnLoading}
isDisabled={isLoading || btnLoading || fileTextArr[0] === ''}
isDisabled={uploading || btnLoading || fileTextArr[0] === ''}
onClick={onclickImport}
>
{uploading ? (
<Box>{Math.round((splitRes.successChunks / splitRes.chunks.length) * 100)}%</Box>
) : (
'确认导入'
)}
</Button>
</Flex>
</ModalContent>