feat: chunk index independent config (#4271)
* sync collection
* remove lock
* feat: chunk index independent config
* feat: add max chunksize to split chunk function
* remove log
* update doc
* remove
* remove log
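Note on the new helpers: the diff below calls several functions from `@fastgpt/global/core/dataset/training/utils` (getLLMDefaultChunkSize, getLLMMaxChunkSize, getAutoIndexSize, getMaxIndexSize, plus the chunkAutoChunkSize / minChunkSize constants) whose bodies are not part of this commit view. A minimal sketch of what they plausibly compute, inferred only from the inline expressions they replace in this diff; the exact constants are assumptions, and getMaxChunkSize is omitted because its derivation is not visible here:

// Sketch only: shapes and fallbacks inferred from the call sites in this diff,
// not copied from the real training/utils implementation.
type LLMModelItemType = { maxResponse: number; maxContext: number };
type EmbeddingModelItemType = { defaultToken?: number; maxToken?: number };

export const chunkAutoChunkSize = 1024; // assumed; old inline code used defaultToken * 2 : 1024
export const minChunkSize = 100; // assumed; old inline lower bound

// QA default mirrors the removed Math.min(maxResponse * 1, maxContext * 0.7)
export const getLLMDefaultChunkSize = (model: LLMModelItemType) =>
  Math.round(Math.min(model.maxResponse, model.maxContext * 0.7));

// QA max mirrors the removed Math.min(maxResponse * 4, maxContext * 0.7)
export const getLLMMaxChunkSize = (model: LLMModelItemType) =>
  Math.round(Math.min(model.maxResponse * 4, model.maxContext * 0.7));

// Index sizes come from the embedding model, as the removed inline code did
export const getAutoIndexSize = (model?: EmbeddingModelItemType) => model?.defaultToken ?? 512;
export const getMaxIndexSize = (model?: EmbeddingModelItemType) => model?.maxToken ?? 512;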
@@ -49,7 +49,7 @@ const EditFolderModal = ({
         defaultValue={name}
         placeholder={t('common:dataset.Folder Name') || ''}
         autoFocus
-        maxLength={20}
+        maxLength={100}
       />
     </ModalBody>
     <ModalFooter>
@@ -10,11 +10,21 @@ import { useMyStep } from '@fastgpt/web/hooks/useStep';
 import { Box, Button, Flex, IconButton } from '@chakra-ui/react';
 import MyIcon from '@fastgpt/web/components/common/Icon';
 import { TabEnum } from '../NavBar';
-import { ChunkSettingModeEnum } from '@/web/core/dataset/constants';
+import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
 import { UseFormReturn, useForm } from 'react-hook-form';
 import { ImportSourceItemType } from '@/web/core/dataset/type';
 import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
 import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
+import { DataChunkSplitModeEnum } from '@fastgpt/global/core/dataset/constants';
+import {
+  getMaxChunkSize,
+  getLLMDefaultChunkSize,
+  getLLMMaxChunkSize,
+  chunkAutoChunkSize,
+  minChunkSize,
+  getAutoIndexSize,
+  getMaxIndexSize
+} from '@fastgpt/global/core/dataset/training/utils';

 type TrainingFiledType = {
   chunkOverlapRatio: number;
@@ -22,6 +32,9 @@ type TrainingFiledType = {
   minChunkSize: number;
   autoChunkSize: number;
   chunkSize: number;
+  maxIndexSize?: number;
+  indexSize?: number;
+  autoIndexSize?: number;
   charsPointsPrice: number;
   priceTip: string;
   uploadRate: number;
@@ -47,9 +60,13 @@ export type ImportFormType = {
   autoIndexes: boolean;

   chunkSettingMode: ChunkSettingModeEnum;
+
+  chunkSplitMode: DataChunkSplitModeEnum;
   embeddingChunkSize: number;
   qaChunkSize: number;
-  customSplitChar: string;
+  chunkSplitter: string;
+  indexSize: number;
+
   qaPrompt: string;
   webSelector: string;
 };
@@ -199,9 +216,12 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
       trainingType: DatasetCollectionDataProcessModeEnum.chunk,

       chunkSettingMode: ChunkSettingModeEnum.auto,
-      embeddingChunkSize: vectorModel?.defaultToken || 512,
-      qaChunkSize: Math.min(agentModel.maxResponse * 1, agentModel.maxContext * 0.7),
-      customSplitChar: '',
+
+      chunkSplitMode: DataChunkSplitModeEnum.size,
+      embeddingChunkSize: 2000,
+      indexSize: vectorModel?.defaultToken || 512,
+      qaChunkSize: getLLMDefaultChunkSize(agentModel),
+      chunkSplitter: '',
       qaPrompt: Prompt_AgentQA.description,
       webSelector: '',
       customPdfParse: false
@@ -215,17 +235,18 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
   const chunkSettingMode = processParamsForm.watch('chunkSettingMode');
   const embeddingChunkSize = processParamsForm.watch('embeddingChunkSize');
   const qaChunkSize = processParamsForm.watch('qaChunkSize');
-  const customSplitChar = processParamsForm.watch('customSplitChar');
+  const chunkSplitter = processParamsForm.watch('chunkSplitter');
+  const autoIndexes = processParamsForm.watch('autoIndexes');
+  const indexSize = processParamsForm.watch('indexSize');

   const TrainingModeMap = useMemo<TrainingFiledType>(() => {
     if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
       return {
         chunkSizeField: 'qaChunkSize',
         chunkOverlapRatio: 0,
-        maxChunkSize: Math.min(agentModel.maxResponse * 4, agentModel.maxContext * 0.7),
-        minChunkSize: 4000,
-        autoChunkSize: Math.min(agentModel.maxResponse * 1, agentModel.maxContext * 0.7),
+        maxChunkSize: getLLMMaxChunkSize(agentModel),
+        minChunkSize: 1000,
+        autoChunkSize: getLLMDefaultChunkSize(agentModel),
         chunkSize: qaChunkSize,
         charsPointsPrice: agentModel.charsPointsPrice || 0,
         priceTip: t('dataset:import.Auto mode Estimated Price Tips', {
@@ -237,10 +258,13 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
       return {
         chunkSizeField: 'embeddingChunkSize',
         chunkOverlapRatio: 0.2,
-        maxChunkSize: 2048,
-        minChunkSize: 100,
-        autoChunkSize: vectorModel?.defaultToken ? vectorModel.defaultToken * 2 : 1024,
+        maxChunkSize: getMaxChunkSize(agentModel),
+        minChunkSize: minChunkSize,
+        autoChunkSize: chunkAutoChunkSize,
         chunkSize: embeddingChunkSize,
+        maxIndexSize: getMaxIndexSize(vectorModel),
+        autoIndexSize: getAutoIndexSize(vectorModel),
+        indexSize,
         charsPointsPrice: agentModel.charsPointsPrice || 0,
         priceTip: t('dataset:import.Auto mode Estimated Price Tips', {
           price: agentModel.charsPointsPrice
@@ -251,10 +275,13 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
     return {
       chunkSizeField: 'embeddingChunkSize',
       chunkOverlapRatio: 0.2,
-      maxChunkSize: vectorModel?.maxToken || 512,
-      minChunkSize: 100,
-      autoChunkSize: vectorModel?.defaultToken || 512,
+      maxChunkSize: getMaxChunkSize(agentModel),
+      minChunkSize: minChunkSize,
+      autoChunkSize: chunkAutoChunkSize,
       chunkSize: embeddingChunkSize,
+      maxIndexSize: getMaxIndexSize(vectorModel),
+      autoIndexSize: getAutoIndexSize(vectorModel),
+      indexSize,
       charsPointsPrice: vectorModel.charsPointsPrice || 0,
       priceTip: t('dataset:import.Embedding Estimated Price Tips', {
         price: vectorModel.charsPointsPrice
@@ -265,30 +292,36 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
   }, [
     trainingType,
-    agentModel.maxResponse,
-    agentModel.maxContext,
-    agentModel.charsPointsPrice,
+    autoIndexes,
+    agentModel,
     qaChunkSize,
     t,
-    vectorModel.defaultToken,
-    vectorModel?.maxToken,
-    vectorModel.charsPointsPrice,
-    embeddingChunkSize
+    embeddingChunkSize,
+    vectorModel,
+    indexSize
   ]);

   const chunkSettingModeMap = useMemo(() => {
     if (chunkSettingMode === ChunkSettingModeEnum.auto) {
       return {
         chunkSize: TrainingModeMap.autoChunkSize,
-        customSplitChar: ''
+        indexSize: TrainingModeMap.autoIndexSize,
+        chunkSplitter: ''
       };
     } else {
       return {
         chunkSize: TrainingModeMap.chunkSize,
-        customSplitChar
+        indexSize: TrainingModeMap.indexSize,
+        chunkSplitter
       };
     }
-  }, [chunkSettingMode, TrainingModeMap.autoChunkSize, TrainingModeMap.chunkSize, customSplitChar]);
+  }, [
+    chunkSettingMode,
+    TrainingModeMap.autoChunkSize,
+    TrainingModeMap.autoIndexSize,
+    TrainingModeMap.chunkSize,
+    TrainingModeMap.indexSize,
+    chunkSplitter
+  ]);

   const contextValue = {
     ...TrainingModeMap,
@@ -20,10 +20,11 @@ import MyIcon from '@fastgpt/web/components/common/Icon';
 import { useTranslation } from 'next-i18next';
 import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio';
 import {
+  DataChunkSplitModeEnum,
   DatasetCollectionDataProcessModeEnum,
   DatasetCollectionDataProcessModeMap
 } from '@fastgpt/global/core/dataset/constants';
-import { ChunkSettingModeEnum } from '@/web/core/dataset/constants';
+import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
 import MyTooltip from '@fastgpt/web/components/common/MyTooltip';
 import { useSystemStore } from '@/web/common/system/useSystemStore';
 import MyModal from '@fastgpt/web/components/common/MyModal';
@@ -37,25 +38,39 @@ import QuestionTip from '@fastgpt/web/components/common/MyTooltip/QuestionTip';
 import { shadowLight } from '@fastgpt/web/styles/theme';
 import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
 import MySelect from '@fastgpt/web/components/common/MySelect';
+import { getIndexSizeSelectList } from '@fastgpt/global/core/dataset/training/utils';
+import RadioGroup from '@fastgpt/web/components/common/Radio/RadioGroup';

 function DataProcess() {
   const { t } = useTranslation();
   const { feConfigs } = useSystemStore();

-  const { goToNext, processParamsForm, chunkSizeField, minChunkSize, maxChunkSize } =
-    useContextSelector(DatasetImportContext, (v) => v);
+  const {
+    goToNext,
+    processParamsForm,
+    chunkSizeField,
+    minChunkSize,
+    maxChunkSize,
+    maxIndexSize,
+    indexSize
+  } = useContextSelector(DatasetImportContext, (v) => v);
   const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
   const { setValue, register, watch, getValues } = processParamsForm;

   const trainingType = watch('trainingType');
-  const chunkSettingMode = watch('chunkSettingMode');
+  const trainingModeList = useMemo(() => {
+    const list = Object.entries(DatasetCollectionDataProcessModeMap);
+    return list
+      .filter(([key]) => key !== DatasetCollectionDataProcessModeEnum.auto)
+      .map(([key, value]) => ({
+        title: t(value.label as any),
+        value: key as DatasetCollectionDataProcessModeEnum,
+        tooltip: t(value.tooltip as any)
+      }));
+  }, [t]);

-  const qaPrompt = watch('qaPrompt');
-  const {
-    isOpen: isOpenCustomPrompt,
-    onOpen: onOpenCustomPrompt,
-    onClose: onCloseCustomPrompt
-  } = useDisclosure();
+  const chunkSettingMode = watch('chunkSettingMode');
+  const chunkSplitMode = watch('chunkSplitMode');

   const customSplitList = [
     { label: t('dataset:split_sign_null'), value: '' },
@@ -69,25 +84,25 @@ function DataProcess() {
     { label: t('dataset:split_sign_custom'), value: 'Other' }
   ];

-  const [customListSelectValue, setCustomListSelectValue] = useState(getValues('customSplitChar'));
+  const [customListSelectValue, setCustomListSelectValue] = useState(getValues('chunkSplitter'));
   useEffect(() => {
     if (customListSelectValue === 'Other') {
-      setValue('customSplitChar', '');
+      setValue('chunkSplitter', '');
     } else {
-      setValue('customSplitChar', customListSelectValue);
+      setValue('chunkSplitter', customListSelectValue);
     }
   }, [customListSelectValue, setValue]);

-  const trainingModeList = useMemo(() => {
-    const list = Object.entries(DatasetCollectionDataProcessModeMap);
-    return list
-      .filter(([key]) => key !== DatasetCollectionDataProcessModeEnum.auto)
-      .map(([key, value]) => ({
-        title: t(value.label as any),
-        value: key as DatasetCollectionDataProcessModeEnum,
-        tooltip: t(value.tooltip as any)
-      }));
-  }, [t]);
+  // Index size
+  const indexSizeSeletorList = useMemo(() => getIndexSizeSelectList(maxIndexSize), [maxIndexSize]);
+
+  // QA
+  const qaPrompt = watch('qaPrompt');
+  const {
+    isOpen: isOpenCustomPrompt,
+    onOpen: onOpenCustomPrompt,
+    onClose: onCloseCustomPrompt
+  } = useDisclosure();

   const Title = useCallback(({ title }: { title: string }) => {
     return (
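Aside: getIndexSizeSelectList(maxIndexSize) above feeds the MySelect<number> added in the next hunk, but its body is not part of this diff. A plausible sketch under the assumption that it filters a fixed set of candidate sizes; the candidate values are invented for illustration:

// Sketch only: the real list lives in @fastgpt/global/core/dataset/training/utils;
// candidate sizes here are assumptions, not the shipped values.
export const getIndexSizeSelectList = (maxIndexSize = 512) =>
  [128, 256, 512, 768, 1024, 2048, 4096, 8192]
    .filter((size) => size <= maxIndexSize) // never offer more than the embedding model allows
    .map((size) => ({ label: `${size}`, value: size }));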
@@ -237,67 +252,97 @@ function DataProcess() {
           children: chunkSettingMode === ChunkSettingModeEnum.custom && (
             <Box mt={5}>
               <Box>
-                <Flex alignItems={'center'}>
-                  <Box>{t('dataset:ideal_chunk_length')}</Box>
-                  <QuestionTip label={t('dataset:ideal_chunk_length_tips')} />
-                </Flex>
-                <Box
-                  mt={1}
-                  css={{
-                    '& > span': {
-                      display: 'block'
-                    }
-                  }}
-                >
-                  <MyTooltip
-                    label={t('common:core.dataset.import.Chunk Range', {
-                      min: minChunkSize,
-                      max: maxChunkSize
-                    })}
-                  >
-                    <MyNumberInput
-                      register={register}
-                      name={chunkSizeField}
-                      min={minChunkSize}
-                      max={maxChunkSize}
-                      size={'sm'}
-                      step={100}
-                    />
-                  </MyTooltip>
-                </Box>
-              </Box>
-
-              <Box mt={3}>
-                {trainingType === DatasetCollectionDataProcessModeEnum.chunk && (
-                  <Box>
-                    {t('common:core.dataset.import.Custom split char')}
-                    <QuestionTip label={t('dataset:custom_split_sign_tip')} />
-                  </Box>
-
-                  <HStack mt={1}>
-                    <Box flex={'1 0 0'}>
-                      <MySelect<string>
-                        list={customSplitList}
-                        size={'sm'}
-                        bg={'myGray.50'}
-                        value={customListSelectValue}
-                        h={'32px'}
-                        onChange={(val) => {
-                          setCustomListSelectValue(val);
-                        }}
-                      />
-                    </Box>
-                    {customListSelectValue === 'Other' && (
-                      <Input
-                        flex={'1 0 0'}
-                        h={'32px'}
-                        size={'sm'}
-                        bg={'myGray.50'}
-                        placeholder="\n;======;==SPLIT=="
-                        {...register('customSplitChar')}
-                      />
-                    )}
-                  </HStack>
-                )}
-              </Box>
+                <RadioGroup<DataChunkSplitModeEnum>
+                  list={[
+                    {
+                      title: t('dataset:split_chunk_size'),
+                      value: DataChunkSplitModeEnum.size
+                    },
+                    {
+                      title: t('dataset:split_chunk_char'),
+                      value: DataChunkSplitModeEnum.char,
+                      tooltip: t('dataset:custom_split_sign_tip')
+                    }
+                  ]}
+                  value={chunkSplitMode}
+                  onChange={(e) => {
+                    setValue('chunkSplitMode', e);
+                  }}
+                />
+
+                {chunkSplitMode === DataChunkSplitModeEnum.size && (
+                  <Box
+                    mt={1.5}
+                    css={{
+                      '& > span': {
+                        display: 'block'
+                      }
+                    }}
+                  >
+                    <MyTooltip
+                      label={t('common:core.dataset.import.Chunk Range', {
+                        min: minChunkSize,
+                        max: maxChunkSize
+                      })}
+                    >
+                      <MyNumberInput
+                        register={register}
+                        name={chunkSizeField}
+                        min={minChunkSize}
+                        max={maxChunkSize}
+                        size={'sm'}
+                        step={100}
+                      />
+                    </MyTooltip>
+                  </Box>
+                )}
+
+                {chunkSplitMode === DataChunkSplitModeEnum.char && (
+                  <HStack mt={1.5}>
+                    <Box flex={'1 0 0'}>
+                      <MySelect<string>
+                        list={customSplitList}
+                        size={'sm'}
+                        bg={'myGray.50'}
+                        value={customListSelectValue}
+                        h={'32px'}
+                        onChange={(val) => {
+                          setCustomListSelectValue(val);
+                        }}
+                      />
+                    </Box>
+                    {customListSelectValue === 'Other' && (
+                      <Input
+                        flex={'1 0 0'}
+                        h={'32px'}
+                        size={'sm'}
+                        bg={'myGray.50'}
+                        placeholder="\n;======;==SPLIT=="
+                        {...register('chunkSplitter')}
+                      />
+                    )}
+                  </HStack>
+                )}
+              </Box>
+
+              <Flex alignItems={'center'} mt={3}>
+                <Box>{t('dataset:index_size')}</Box>
+                <QuestionTip label={t('dataset:index_size_tips')} />
+              </Flex>
+              <Box mt={1}>
+                <MySelect<number>
+                  bg={'myGray.50'}
+                  list={indexSizeSeletorList}
+                  value={indexSize}
+                  h={'32px'}
+                  onChange={(val) => {
+                    setValue('indexSize', val);
+                  }}
+                />
+              </Box>
             </Box>
           )}

           {showQAPromptInput && (
             <Box mt={3}>
@@ -16,6 +16,7 @@ import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
 import MyBox from '@fastgpt/web/components/common/MyBox';
 import Markdown from '@/components/Markdown';
 import { useToast } from '@fastgpt/web/hooks/useToast';
+import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';

 const PreviewData = () => {
   const { t } = useTranslation();
@@ -23,6 +24,7 @@ const PreviewData = () => {
   const goToNext = useContextSelector(DatasetImportContext, (v) => v.goToNext);

   const datasetId = useContextSelector(DatasetPageContext, (v) => v.datasetId);
+  const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);

   const sources = useContextSelector(DatasetImportContext, (v) => v.sources);
   const importSource = useContextSelector(DatasetImportContext, (v) => v.importSource);
@@ -36,12 +38,13 @@ const PreviewData = () => {
     async () => {
       if (!previewFile) return;
       if (importSource === ImportDataSourceEnum.fileCustom) {
-        const customSplitChar = processParamsForm.getValues('customSplitChar');
+        const chunkSplitter = processParamsForm.getValues('chunkSplitter');
         const { chunks } = splitText2Chunks({
           text: previewFile.rawText || '',
-          chunkLen: chunkSize,
+          chunkSize,
+          maxSize: getLLMMaxChunkSize(datasetDetail.agentModel),
           overlapRatio: chunkOverlapRatio,
-          customReg: customSplitChar ? [customSplitChar] : []
+          customReg: chunkSplitter ? [chunkSplitter] : []
         });
         return chunks.map((chunk) => ({
           q: chunk,
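Aside: the preview path above now passes both the user-facing chunkSize and a model-derived hard ceiling (maxSize), which is the "add max chunksize to split chunk function" part of this commit. A hedged sketch of the updated call shape; the option names are taken from the call site in this diff, while the concrete values and the rawText input are placeholders:

// Sketch of the call shape only, under assumed values.
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';

const rawText = '...'; // source text to split (placeholder)
const { chunks } = splitText2Chunks({
  text: rawText,
  chunkSize: 512, // target chunk length chosen by the user
  maxSize: 8000, // hard ceiling; in the diff this is getLLMMaxChunkSize(datasetDetail.agentModel)
  overlapRatio: 0.2, // overlap between neighbouring chunks
  customReg: ['======'] // optional custom split pattern; [] disables it
});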
@@ -61,9 +64,12 @@ const PreviewData = () => {
       customPdfParse: processParamsForm.getValues('customPdfParse'),

       trainingType: processParamsForm.getValues('trainingType'),
+      chunkSettingMode: processParamsForm.getValues('chunkSettingMode'),
+      chunkSplitMode: processParamsForm.getValues('chunkSplitMode'),
       chunkSize,
+      chunkSplitter: processParamsForm.getValues('chunkSplitter'),
       overlapRatio: chunkOverlapRatio,
-      customSplitChar: processParamsForm.getValues('customSplitChar'),

       selector: processParamsForm.getValues('webSelector'),
       isQAImport: importSource === ImportDataSourceEnum.csvTable,
@@ -49,7 +49,7 @@ const Upload = () => {
   const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
   const retrainNewCollectionId = useRef('');

-  const { importSource, parentId, sources, setSources, processParamsForm, chunkSize } =
+  const { importSource, parentId, sources, setSources, processParamsForm, chunkSize, indexSize } =
     useContextSelector(DatasetImportContext, (v) => v);

   const { handleSubmit } = processParamsForm;
@@ -81,7 +81,7 @@ const Upload = () => {
   }, [waitingFilesCount, totalFilesCount, allFinished, t]);

   const { runAsync: startUpload, loading: isLoading } = useRequest2(
-    async ({ trainingType, customSplitChar, qaPrompt, webSelector }: ImportFormType) => {
+    async ({ trainingType, chunkSplitter, qaPrompt, webSelector }: ImportFormType) => {
       if (sources.length === 0) return;
       const filterWaitingSources = sources.filter((item) => item.createStatus === 'waiting');
@@ -111,10 +111,16 @@ const Upload = () => {
         trainingType,
         imageIndex: processParamsForm.getValues('imageIndex'),
         autoIndexes: processParamsForm.getValues('autoIndexes'),
+
+        chunkSettingMode: processParamsForm.getValues('chunkSettingMode'),
+        chunkSplitMode: processParamsForm.getValues('chunkSplitMode'),
+
         chunkSize,
-        chunkSplitter: customSplitChar,
+        indexSize,
+        chunkSplitter,
         qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? qaPrompt : undefined
       };

       if (importSource === ImportDataSourceEnum.reTraining) {
         const res = await postReTrainingDatasetFileCollection({
           ...commonParams,
@@ -1,102 +0,0 @@
-import React from 'react';
-import { Box } from '@chakra-ui/react';
-import { ImportSourceItemType } from '@/web/core/dataset/type';
-import MyRightDrawer from '@fastgpt/web/components/common/MyDrawer/MyRightDrawer';
-import { getPreviewChunks } from '@/web/core/dataset/api';
-import { ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants';
-import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
-import { useContextSelector } from 'use-context-selector';
-import { DatasetImportContext } from '../Context';
-import { useRequest2 } from '@fastgpt/web/hooks/useRequest';
-import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
-import { getPreviewSourceReadType } from '../utils';
-
-const PreviewChunks = ({
-  previewSource,
-  onClose
-}: {
-  previewSource: ImportSourceItemType;
-  onClose: () => void;
-}) => {
-  const { importSource, chunkSize, chunkOverlapRatio, processParamsForm } = useContextSelector(
-    DatasetImportContext,
-    (v) => v
-  );
-  const datasetId = useContextSelector(DatasetPageContext, (v) => v.datasetId);
-
-  const { data = [], loading: isLoading } = useRequest2(
-    async () => {
-      if (importSource === ImportDataSourceEnum.fileCustom) {
-        const customSplitChar = processParamsForm.getValues('customSplitChar');
-        const { chunks } = splitText2Chunks({
-          text: previewSource.rawText || '',
-          chunkLen: chunkSize,
-          overlapRatio: chunkOverlapRatio,
-          customReg: customSplitChar ? [customSplitChar] : []
-        });
-        return chunks.map((chunk) => ({
-          q: chunk,
-          a: ''
-        }));
-      }
-
-      return getPreviewChunks({
-        datasetId,
-        type: getPreviewSourceReadType(previewSource),
-        sourceId:
-          previewSource.dbFileId ||
-          previewSource.link ||
-          previewSource.externalFileUrl ||
-          previewSource.apiFileId ||
-          '',
-
-        chunkSize,
-        overlapRatio: chunkOverlapRatio,
-        customSplitChar: processParamsForm.getValues('customSplitChar'),
-
-        selector: processParamsForm.getValues('webSelector'),
-        isQAImport: importSource === ImportDataSourceEnum.csvTable,
-        externalFileId: previewSource.externalFileId
-      });
-    },
-    {
-      manual: false
-    }
-  );
-
-  return (
-    <MyRightDrawer
-      onClose={onClose}
-      iconSrc={previewSource.icon}
-      title={previewSource.sourceName}
-      isLoading={isLoading}
-      maxW={['90vw', '40vw']}
-      px={0}
-    >
-      <Box overflowY={'auto'} px={5} fontSize={'sm'}>
-        {data.map((item, index) => (
-          <Box
-            key={index}
-            whiteSpace={'pre-wrap'}
-            fontSize={'sm'}
-            p={4}
-            bg={index % 2 === 0 ? 'white' : 'myWhite.600'}
-            mb={3}
-            borderRadius={'md'}
-            borderWidth={'1px'}
-            borderColor={'borderColor.low'}
-            boxShadow={'2'}
-            _notLast={{
-              mb: 2
-            }}
-          >
-            <Box color={'myGray.900'}>{item.q}</Box>
-            <Box color={'myGray.500'}>{item.a}</Box>
-          </Box>
-        ))}
-      </Box>
-    </MyRightDrawer>
-  );
-};
-
-export default React.memo(PreviewChunks);
@@ -8,10 +8,11 @@ import { useRouter } from 'next/router';
 import { useRequest2 } from '@fastgpt/web/hooks/useRequest';
 import { getDatasetCollectionById } from '@/web/core/dataset/api';
 import MyBox from '@fastgpt/web/components/common/MyBox';
-import { ChunkSettingModeEnum } from '@/web/core/dataset/constants';
+import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
 import { getCollectionIcon } from '@fastgpt/global/core/dataset/utils';
 import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
 import { Box } from '@chakra-ui/react';
 import { DataChunkSplitModeEnum } from '@fastgpt/global/core/dataset/constants';
+import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';

 const Upload = dynamic(() => import('../commonProgress/Upload'));
 const PreviewData = dynamic(() => import('../commonProgress/PreviewData'));
@@ -23,7 +24,6 @@ const ReTraining = () => {
     collectionId: string;
   };

-  const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
   const activeStep = useContextSelector(DatasetImportContext, (v) => v.activeStep);
   const setSources = useContextSelector(DatasetImportContext, (v) => v.setSources);
   const processParamsForm = useContextSelector(DatasetImportContext, (v) => v.processParamsForm);
@@ -46,18 +46,21 @@ const ReTraining = () => {
           uploadedFileRate: 100
         }
       ]);

       processParamsForm.reset({
         customPdfParse: collection.customPdfParse,
         trainingType: collection.trainingType,
         imageIndex: collection.imageIndex,
         autoIndexes: collection.autoIndexes,
-        chunkSettingMode: ChunkSettingModeEnum.auto,
+
+        chunkSettingMode: collection.chunkSettingMode || ChunkSettingModeEnum.auto,
+        chunkSplitMode: collection.chunkSplitMode || DataChunkSplitModeEnum.size,
         embeddingChunkSize: collection.chunkSize,
         qaChunkSize: collection.chunkSize,
-        customSplitChar: collection.chunkSplitter,
-        qaPrompt: collection.qaPrompt,
-        webSelector: collection.metadata?.webPageSelector
+        indexSize: collection.indexSize || 512,
+        chunkSplitter: collection.chunkSplitter,
+        webSelector: collection.metadata?.webPageSelector,
+        qaPrompt: collection.qaPrompt || Prompt_AgentQA.description
       });
     }
   });