feat: api dataset support pdf parse; fix: chunk reader auth (#4117)

* feat: api dataset support pdf parse

* fix: chunk reader auth
This commit is contained in:
Archer
2025-03-12 12:41:19 +08:00
committed by archer
parent 30f83f848d
commit 1a3613cd2c
27 changed files with 378 additions and 355 deletions

View File

@@ -5,25 +5,25 @@ import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
import { ApiRequestProps } from '@fastgpt/service/type/next';
import { LinkedListResponse, LinkedPaginationProps } from '@fastgpt/web/common/fetch/type';
import { FilterQuery, Types } from 'mongoose';
import { dataFieldSelector, processChatTimeFilter } from './getQuote';
import { authDatasetCollection } from '@fastgpt/service/support/permission/dataset/auth';
import { ReadPermissionVal } from '@fastgpt/global/support/permission/constant';
import { quoteDataFieldSelector, QuoteDataItemType } from '@/service/core/chat/constants';
import { processChatTimeFilter } from '@/service/core/chat/utils';
import { ChatErrEnum } from '@fastgpt/global/common/error/code/chat';
export type GetCollectionQuoteProps = LinkedPaginationProps & {
chatTime: Date;
chatId: string;
chatItemDataId: string;
isInitialLoad: boolean;
collectionId: string;
chatItemId: string;
appId: string;
chatId: string;
shareId?: string;
outLinkUid?: string;
teamId?: string;
teamToken?: string;
};
export type GetCollectionQuoteRes = LinkedListResponse<DatasetDataSchemaType>;
export type GetCollectionQuoteRes = LinkedListResponse<QuoteDataItemType>;
type BaseMatchType = FilterQuery<DatasetDataSchemaType>;
@@ -37,11 +37,10 @@ async function handler(
prevIndex,
nextId,
nextIndex,
chatTime,
isInitialLoad,
collectionId,
chatItemId,
chatItemDataId,
appId,
chatId,
shareId,
@@ -53,61 +52,50 @@ async function handler(
const limitedPageSize = Math.min(pageSize, 30);
try {
await authDatasetCollection({
const [{ chat }, { chatItem }] = await Promise.all([
authChatCrud({
req,
authToken: true,
authApiKey: true,
collectionId: req.body.collectionId,
per: ReadPermissionVal
});
} catch (error) {
await Promise.all([
authChatCrud({
req,
authToken: true,
appId,
chatId,
shareId,
outLinkUid,
teamId,
teamToken
}),
authCollectionInChat({ appId, chatId, chatItemId, collectionId })
]);
}
appId,
chatId,
shareId,
outLinkUid,
teamId,
teamToken
}),
authCollectionInChat({ appId, chatId, chatItemDataId, collectionIds: [collectionId] })
]);
if (!chat) return Promise.reject(ChatErrEnum.unAuthChat);
const baseMatch: BaseMatchType = {
collectionId,
$or: [
{ updateTime: { $lt: new Date(chatTime) } },
{ history: { $elemMatch: { updateTime: { $lt: new Date(chatTime) } } } }
{ updateTime: { $lt: new Date(chatItem.time) } },
{ history: { $elemMatch: { updateTime: { $lt: new Date(chatItem.time) } } } }
]
};
if (initialId && initialIndex !== undefined) {
return await handleInitialLoad(
return await handleInitialLoad({
initialId,
initialIndex,
limitedPageSize,
chatTime,
chatItemId,
pageSize: limitedPageSize,
chatTime: chatItem.time,
isInitialLoad,
baseMatch
);
});
}
if ((prevId && prevIndex !== undefined) || (nextId && nextIndex !== undefined)) {
return await handlePaginatedLoad(
return await handlePaginatedLoad({
prevId,
prevIndex,
nextId,
nextIndex,
limitedPageSize,
chatTime,
chatItemId,
pageSize: limitedPageSize,
chatTime: chatItem.time,
baseMatch
);
});
}
return { list: [], hasMorePrev: false, hasMoreNext: false };
@@ -115,38 +103,39 @@ async function handler(
export default NextAPI(handler);
async function handleInitialLoad(
initialId: string,
initialIndex: number,
pageSize: number,
chatTime: Date,
chatItemId: string,
isInitialLoad: boolean,
baseMatch: BaseMatchType
): Promise<GetCollectionQuoteRes> {
async function handleInitialLoad({
initialId,
initialIndex,
pageSize,
chatTime,
isInitialLoad,
baseMatch
}: {
initialId: string;
initialIndex: number;
pageSize: number;
chatTime: Date;
isInitialLoad: boolean;
baseMatch: BaseMatchType;
}): Promise<GetCollectionQuoteRes> {
const centerNode = await MongoDatasetData.findOne(
{
_id: new Types.ObjectId(initialId)
},
dataFieldSelector
quoteDataFieldSelector
).lean();
if (!centerNode) {
if (isInitialLoad) {
const list = await MongoDatasetData.find(baseMatch, dataFieldSelector)
const list = await MongoDatasetData.find(baseMatch, quoteDataFieldSelector)
.sort({ chunkIndex: 1, _id: -1 })
.limit(pageSize)
.lean();
const listRes = list.map((item, index) => ({
...item,
index: item.chunkIndex
}));
const hasMoreNext = list.length === pageSize;
return {
list: listRes,
list: processChatTimeFilter(list, chatTime),
hasMorePrev: false,
hasMoreNext
};
@@ -173,28 +162,30 @@ async function handleInitialLoad(
const resultList = [...prevList, centerNode, ...nextList];
const list = processChatTimeFilter(resultList, chatTime);
return {
list: list.map((item) => ({
...item,
index: item.chunkIndex
})),
list: processChatTimeFilter(resultList, chatTime),
hasMorePrev,
hasMoreNext
};
}
async function handlePaginatedLoad(
prevId: string | undefined,
prevIndex: number | undefined,
nextId: string | undefined,
nextIndex: number | undefined,
pageSize: number,
chatTime: Date,
chatItemId: string,
baseMatch: BaseMatchType
): Promise<GetCollectionQuoteRes> {
async function handlePaginatedLoad({
prevId,
prevIndex,
nextId,
nextIndex,
pageSize,
chatTime,
baseMatch
}: {
prevId: string | undefined;
prevIndex: number | undefined;
nextId: string | undefined;
nextIndex: number | undefined;
pageSize: number;
chatTime: Date;
baseMatch: BaseMatchType;
}): Promise<GetCollectionQuoteRes> {
const { list, hasMore } =
prevId && prevIndex !== undefined
? await getPrevNodes(prevId, prevIndex, pageSize, baseMatch)
@@ -203,10 +194,7 @@ async function handlePaginatedLoad(
const processedList = processChatTimeFilter(list, chatTime);
return {
list: processedList.map((item) => ({
...item,
index: item.chunkIndex
})),
list: processedList,
hasMorePrev: !!prevId && hasMore,
hasMoreNext: !!nextId && hasMore
};
@@ -217,7 +205,10 @@ async function getPrevNodes(
initialIndex: number,
limit: number,
baseMatch: BaseMatchType
) {
): Promise<{
list: DatasetDataSchemaType[];
hasMore: boolean;
}> {
const match: BaseMatchType = {
...baseMatch,
$or: [
@@ -226,7 +217,7 @@ async function getPrevNodes(
]
};
const list = await MongoDatasetData.find(match, dataFieldSelector)
const list = await MongoDatasetData.find(match, quoteDataFieldSelector)
.sort({ chunkIndex: -1, _id: 1 })
.limit(limit)
.lean();
@@ -242,7 +233,10 @@ async function getNextNodes(
initialIndex: number,
limit: number,
baseMatch: BaseMatchType
) {
): Promise<{
list: DatasetDataSchemaType[];
hasMore: boolean;
}> {
const match: BaseMatchType = {
...baseMatch,
$or: [
@@ -251,7 +245,7 @@ async function getNextNodes(
]
};
const list = await MongoDatasetData.find(match, dataFieldSelector)
const list = await MongoDatasetData.find(match, quoteDataFieldSelector)
.sort({ chunkIndex: 1, _id: -1 })
.limit(limit)
.lean();

View File

@@ -1,100 +1,64 @@
import { NextAPI } from '@/service/middleware/entry';
import { authChatCrud, authCollectionInChat } from '@/service/support/permission/auth/chat';
import { DatasetDataSchemaType } from '@fastgpt/global/core/dataset/type';
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
import { ApiRequestProps } from '@fastgpt/service/type/next';
import { quoteDataFieldSelector, QuoteDataItemType } from '@/service/core/chat/constants';
import { processChatTimeFilter } from '@/service/core/chat/utils';
import { ChatErrEnum } from '@fastgpt/global/common/error/code/chat';
export type GetQuoteDataProps = {
datasetDataIdList: string[];
chatTime: Date;
collectionIdList: string[];
chatItemId: string;
appId: string;
chatId: string;
chatItemDataId: string;
appId: string;
shareId?: string;
outLinkUid?: string;
teamId?: string;
teamToken?: string;
};
export type GetQuoteDataRes = {
quoteList: DatasetDataSchemaType[];
};
export const dataFieldSelector =
'_id datasetId collectionId q a chunkIndex history updateTime currentChatItemId prevId';
export type GetQuoteDataRes = QuoteDataItemType[];
async function handler(req: ApiRequestProps<GetQuoteDataProps>): Promise<GetQuoteDataRes> {
const {
datasetDataIdList,
chatTime,
appId,
chatId,
chatItemDataId,
shareId,
outLinkUid,
teamId,
teamToken,
collectionIdList,
chatItemId,
chatId,
appId,
shareId,
outLinkUid,
teamId,
teamToken
datasetDataIdList
} = req.body;
await authChatCrud({
req,
authToken: true,
appId,
chatId,
shareId,
outLinkUid,
teamId,
teamToken
});
await Promise.all(
collectionIdList.map(async (collectionId) => {
await authCollectionInChat({ appId, chatId, chatItemId, collectionId });
})
);
const [chat, { chatItem }] = await Promise.all([
authChatCrud({
req,
authToken: true,
appId,
chatId,
shareId,
outLinkUid,
teamId,
teamToken
}),
authCollectionInChat({ appId, chatId, chatItemDataId, collectionIds: collectionIdList })
]);
if (!chat) return Promise.reject(ChatErrEnum.unAuthChat);
const list = await MongoDatasetData.find(
{ _id: { $in: datasetDataIdList } },
dataFieldSelector
{ _id: { $in: datasetDataIdList }, collectionId: { $in: collectionIdList } },
quoteDataFieldSelector
).lean();
const quoteList = processChatTimeFilter(list, chatTime);
const quoteList = processChatTimeFilter(list, chatItem.time);
return {
quoteList
};
return quoteList;
}
export default NextAPI(handler);
/**
 * Rewinds dataset chunks to the content they had at the time of a chat.
 *
 * For every item in `list`:
 * - If the item has no `history`, it is returned untouched.
 * - Otherwise `history` is stripped from the returned object.
 *   - If the item itself was last updated at or before `chatTime`, its current
 *     `q`/`a` are kept.
 *   - Else the first history entry whose `updateTime` is at or before `chatTime`
 *     supplies `q`/`a`, and the result is flagged with `updated: true`.
 *   - If no history entry qualifies, the current `q`/`a` are kept.
 *
 * NOTE(review): picking the *first* matching entry only yields the latest
 * applicable revision if `history` is stored newest-first — confirm against
 * the code that writes `history`.
 *
 * @param list Chunks to process; the input array and its items are not mutated.
 * @param chatTime Moment of the chat. It is passed through `new Date(...)`, so a
 *   value that is not already a `Date` instance is converted before comparing.
 * @returns A new array with one entry per input item, in the same order.
 */
export function processChatTimeFilter(list: DatasetDataSchemaType[], chatTime: Date) {
  // Loop-invariant: normalize the chat time once instead of once per item.
  const formatedChatTime = new Date(chatTime);

  return list.map((item) => {
    if (!item.history) return item;

    const { history, ...rest } = item;

    // The chunk has not been edited since the chat: current content is correct.
    if (item.updateTime <= formatedChatTime) {
      return rest;
    }

    const latestHistory = history.find(
      (historyItem: any) => historyItem.updateTime <= formatedChatTime
    );
    if (!latestHistory) return rest;

    return {
      ...rest,
      // `??` rather than `||`: an empty-string historical value (e.g. a chunk
      // that had no answer back then) is real data and must not be replaced by
      // the current, post-chat content.
      q: latestHistory.q ?? item.q,
      a: latestHistory.a ?? item.a,
      updated: true
    };
  });
}

View File

@@ -2,11 +2,7 @@ import type { NextApiRequest } from 'next';
import type { ApiDatasetCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller';
import {
TrainingModeEnum,
DatasetCollectionTypeEnum,
DatasetCollectionDataProcessModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { NextAPI } from '@/service/middleware/entry';
import { WritePermissionVal } from '@fastgpt/global/support/permission/constant';
@@ -16,7 +12,8 @@ import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection
import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset';
async function handler(req: NextApiRequest): CreateCollectionResponse {
const { name, apiFileId, ...body } = req.body as ApiDatasetCreateDatasetCollectionParams;
const { name, apiFileId, customPdfParse, ...body } =
req.body as ApiDatasetCreateDatasetCollectionParams;
const { teamId, tmbId, dataset } = await authDataset({
req,
@@ -50,7 +47,8 @@ async function handler(req: NextApiRequest): CreateCollectionResponse {
yuqueServer,
apiFileId,
teamId,
tmbId
tmbId,
customPdfParse
});
const { collectionId, insertResults } = await createCollectionAndInsertData({
@@ -62,11 +60,12 @@ async function handler(req: NextApiRequest): CreateCollectionResponse {
teamId,
tmbId,
type: DatasetCollectionTypeEnum.apiFile,
name: name,
name,
apiFileId,
metadata: {
relatedImgId: apiFileId
}
},
customPdfParse
}
});

View File

@@ -1,9 +1,13 @@
import { NextAPI } from '@/service/middleware/entry';
import { authChatCrud, authCollectionInChat } from '@/service/support/permission/auth/chat';
import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset';
import { OutLinkChatAuthProps } from '@fastgpt/global/support/permission/chat';
import { ReadPermissionVal } from '@fastgpt/global/support/permission/constant';
import { useIPFrequencyLimit } from '@fastgpt/service/common/middle/reqFrequencyLimit';
import { readFromSecondary } from '@fastgpt/service/common/mongo/utils';
import { responseWriteController } from '@fastgpt/service/common/response';
import { addLog } from '@fastgpt/service/common/system/log';
import { getCollectionWithDataset } from '@fastgpt/service/core/dataset/controller';
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
import { authDatasetCollection } from '@fastgpt/service/support/permission/dataset/auth';
import { ApiRequestProps } from '@fastgpt/service/type/next';
@@ -11,21 +15,69 @@ import { NextApiResponse } from 'next';
export type ExportCollectionBody = {
collectionId: string;
appId?: string;
chatId?: string;
chatItemDataId?: string;
chatTime: Date;
};
} & OutLinkChatAuthProps;
async function handler(req: ApiRequestProps<ExportCollectionBody, {}>, res: NextApiResponse) {
let { collectionId, chatTime } = req.body;
const { teamId, collection } = await authDatasetCollection({
req,
authToken: true,
const {
collectionId,
per: ReadPermissionVal
});
appId,
chatId,
chatItemDataId,
shareId,
outLinkUid,
teamId,
teamToken,
chatTime
} = req.body;
const { collection, teamId: userTeamId } = await (async () => {
if (!appId || !chatId || !chatItemDataId) {
return authDatasetCollection({
req,
authToken: true,
authApiKey: true,
collectionId: req.body.collectionId,
per: ReadPermissionVal
});
}
/*
1. auth chat read permission
2. auth collection quote in chat
3. auth outlink open show quote
*/
const [authRes, collection] = await Promise.all([
authChatCrud({
req,
authToken: true,
appId,
chatId,
shareId,
outLinkUid,
teamId,
teamToken
}),
getCollectionWithDataset(collectionId),
authCollectionInChat({ appId, chatId, chatItemDataId, collectionIds: [collectionId] })
]);
if (!authRes.showRawSource) {
return Promise.reject(DatasetErrEnum.unAuthDatasetFile);
}
return {
...authRes,
collection
};
})();
const where = {
teamId,
teamId: userTeamId,
datasetId: collection.datasetId,
collectionId,
...(chatTime

View File

@@ -19,7 +19,7 @@ export type readCollectionSourceBody = {
appId?: string;
chatId?: string;
chatItemId?: string;
chatItemDataId?: string;
} & OutLinkChatAuthProps;
export type readCollectionSourceResponse = {
@@ -30,7 +30,7 @@ export type readCollectionSourceResponse = {
async function handler(
req: ApiRequestProps<readCollectionSourceBody, readCollectionSourceQuery>
): Promise<readCollectionSourceResponse> {
const { collectionId, appId, chatId, chatItemId, shareId, outLinkUid, teamId, teamToken } =
const { collectionId, appId, chatId, chatItemDataId, shareId, outLinkUid, teamId, teamToken } =
req.body;
const {
@@ -39,7 +39,7 @@ async function handler(
tmbId: uid,
authType
} = await (async () => {
if (!appId || !chatId || !chatItemId) {
if (!appId || !chatId || !chatItemDataId) {
return authDatasetCollection({
req,
authToken: true,
@@ -66,7 +66,7 @@ async function handler(
teamToken
}),
getCollectionWithDataset(collectionId),
authCollectionInChat({ appId, chatId, chatItemId, collectionId })
authCollectionInChat({ appId, chatId, chatItemDataId, collectionIds: [collectionId] })
]);
if (!authRes.showRawSource) {

View File

@@ -231,7 +231,6 @@ const Chat = ({ myApps }: { myApps: AppListItemType[] }) => {
{quoteData && (
<PageContainer flex={'1 0 0'} w={0} maxW={'560px'}>
<ChatQuoteList
chatTime={quoteData.chatTime}
rawSearch={quoteData.rawSearch}
metadata={quoteData.metadata}
onClose={() => setQuoteData(undefined)}

View File

@@ -304,7 +304,6 @@ const OutLink = (props: Props) => {
{quoteData && (
<PageContainer flex={'1 0 0'} w={0} maxW={'560px'}>
<ChatQuoteList
chatTime={quoteData.chatTime}
rawSearch={quoteData.rawSearch}
metadata={quoteData.metadata}
onClose={() => setQuoteData(undefined)}

View File

@@ -245,7 +245,6 @@ const Chat = ({ myApps }: { myApps: AppListItemType[] }) => {
{quoteData && (
<PageContainer flex={'1 0 0'} w={0} maxW={'560px'}>
<ChatQuoteList
chatTime={quoteData.chatTime}
rawSearch={quoteData.rawSearch}
metadata={quoteData.metadata}
onClose={() => setQuoteData(undefined)}