Add image index and pdf parse (#3956)

* feat: think tag parse

* feat: parse think tag test

* feat: pdf parse ux

* feat: doc2x parse

* perf: rewrite training mode setting

* feat: image parse queue

* perf: image index

* feat: image parse process

* feat: add init sh

* fix: ts
This commit is contained in:
Archer
2025-03-03 23:08:29 +08:00
committed by archer
parent 08b6f594df
commit adf5377ebe
106 changed files with 2337 additions and 1454 deletions

View File

@@ -28,7 +28,7 @@ const APIDatasetCollection = () => {
return (
<>
{activeStep === 0 && <CustomAPIFileInput />}
{activeStep === 1 && <DataProcess showPreviewChunks={true} />}
{activeStep === 1 && <DataProcess />}
{activeStep === 2 && <Upload />}
</>
);
@@ -272,7 +272,7 @@ const CustomAPIFileInput = () => {
onClick={onclickNext}
>
{selectFiles.length > 0
? `${t('common:core.dataset.import.Total files', { total: selectFiles.length })} | `
? `${t('dataset:total_num_files', { total: selectFiles.length })} | `
: ''}
{t('common:common.Next Step')}
</Button>

View File

@@ -34,7 +34,7 @@ const ExternalFileCollection = () => {
return (
<>
{activeStep === 0 && <CustomLinkInput />}
{activeStep === 1 && <DataProcess showPreviewChunks={true} />}
{activeStep === 1 && <DataProcess />}
{activeStep === 2 && <Upload />}
</>
);

View File

@@ -19,7 +19,7 @@ const CustomTet = () => {
return (
<>
{activeStep === 0 && <CustomTextInput />}
{activeStep === 1 && <DataProcess showPreviewChunks />}
{activeStep === 1 && <DataProcess />}
{activeStep === 2 && <Upload />}
</>
);

View File

@@ -23,7 +23,7 @@ const LinkCollection = () => {
return (
<>
{activeStep === 0 && <CustomLinkImport />}
{activeStep === 1 && <DataProcess showPreviewChunks />}
{activeStep === 1 && <DataProcess />}
{activeStep === 2 && <Upload />}
</>
);

View File

@@ -10,9 +10,8 @@ import { RenderUploadFiles } from '../components/RenderFiles';
import { useContextSelector } from 'use-context-selector';
import { DatasetImportContext } from '../Context';
const DataProcess = dynamic(() => import('../commonProgress/DataProcess'), {
loading: () => <Loading fixed={false} />
});
const DataProcess = dynamic(() => import('../commonProgress/DataProcess'));
const PreviewData = dynamic(() => import('../commonProgress/PreviewData'));
const Upload = dynamic(() => import('../commonProgress/Upload'));
const fileType = '.txt, .docx, .csv, .xlsx, .pdf, .md, .html, .pptx';
@@ -23,8 +22,9 @@ const FileLocal = () => {
return (
<>
{activeStep === 0 && <SelectFile />}
{activeStep === 1 && <DataProcess showPreviewChunks />}
{activeStep === 2 && <Upload />}
{activeStep === 1 && <DataProcess />}
{activeStep === 2 && <PreviewData />}
{activeStep === 3 && <Upload />}
</>
);
};
@@ -64,12 +64,12 @@ const SelectFile = React.memo(function SelectFile() {
/>
{/* render files */}
<RenderUploadFiles files={selectFiles} setFiles={setSelectFiles} showPreviewContent />
<RenderUploadFiles files={selectFiles} setFiles={setSelectFiles} />
<Box textAlign={'right'} mt={5}>
<Button isDisabled={successFiles.length === 0 || uploading} onClick={onclickNext}>
{selectFiles.length > 0
? `${t('core.dataset.import.Total files', { total: selectFiles.length })} | `
? `${t('dataset:total_num_files', { total: selectFiles.length })} | `
: ''}
{t('common:common.Next Step')}
</Button>

View File

@@ -8,10 +8,13 @@ import { useRouter } from 'next/router';
import { useRequest2 } from '@fastgpt/web/hooks/useRequest';
import { getDatasetCollectionById } from '@/web/core/dataset/api';
import MyBox from '@fastgpt/web/components/common/MyBox';
import { ImportProcessWayEnum } from '@/web/core/dataset/constants';
import { ChunkSettingModeEnum } from '@/web/core/dataset/constants';
import { getCollectionIcon } from '@fastgpt/global/core/dataset/utils';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import { Box } from '@chakra-ui/react';
const Upload = dynamic(() => import('../commonProgress/Upload'));
const PreviewData = dynamic(() => import('../commonProgress/PreviewData'));
const ReTraining = () => {
const router = useRouter();
@@ -20,6 +23,7 @@ const ReTraining = () => {
collectionId: string;
};
const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
const activeStep = useContextSelector(DatasetImportContext, (v) => v.activeStep);
const setSources = useContextSelector(DatasetImportContext, (v) => v.setSources);
const processParamsForm = useContextSelector(DatasetImportContext, (v) => v.processParamsForm);
@@ -43,8 +47,12 @@ const ReTraining = () => {
}
]);
processParamsForm.reset({
mode: collection.trainingType,
way: ImportProcessWayEnum.auto,
customPdfParse: collection.customPdfParse,
trainingType: collection.trainingType,
imageIndex: collection.imageIndex,
autoIndexes: collection.autoIndexes,
chunkSettingMode: ChunkSettingModeEnum.auto,
embeddingChunkSize: collection.chunkSize,
qaChunkSize: collection.chunkSize,
customSplitChar: collection.chunkSplitter,
@@ -55,9 +63,12 @@ const ReTraining = () => {
});
return (
<MyBox isLoading={loading} h={'100%'} overflow={'auto'}>
{activeStep === 0 && <DataProcess showPreviewChunks={true} />}
{activeStep === 1 && <Upload />}
<MyBox isLoading={loading} h={'100%'}>
<Box h={'100%'} overflow={'auto'}>
{activeStep === 0 && <DataProcess />}
{activeStep === 1 && <PreviewData />}
{activeStep === 2 && <Upload />}
</Box>
</MyBox>
);
};

View File

@@ -21,7 +21,7 @@ const FileLocal = () => {
return (
<>
{activeStep === 0 && <SelectFile />}
{activeStep === 1 && <PreviewData showPreviewChunks />}
{activeStep === 1 && <PreviewData />}
{activeStep === 2 && <Upload />}
</>
);
@@ -91,7 +91,7 @@ const SelectFile = React.memo(function SelectFile() {
}}
>
{selectFiles.length > 0
? `${t('core.dataset.import.Total files', { total: selectFiles.length })} | `
? `${t('dataset:total_num_files', { total: selectFiles.length })} | `
: ''}
{t('common:common.Next Step')}
</Button>