full text tmp code (#3561)

* full text tmp code

* fix: init

* fix: init

* remove tmp code

* remove tmp code

* 4818-alpha
This commit is contained in:
Archer
2025-01-10 18:03:14 +08:00
committed by GitHub
parent fadb3e3ceb
commit 47f7b1a7a3
4 changed files with 48 additions and 46 deletions

View File

@@ -33,12 +33,7 @@ const DatasetDataTextSchema = new Schema({
}); });
try { try {
DatasetDataTextSchema.index( DatasetDataTextSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
{ teamId: 1, datasetId: 1, fullTextToken: 'text' },
{
partialFilterExpression: { fullTextToken: { $exists: true } }
}
);
DatasetDataTextSchema.index({ dataId: 1 }, { unique: true }); DatasetDataTextSchema.index({ dataId: 1 }, { unique: true });
} catch (error) { } catch (error) {
console.log(error); console.log(error);

View File

@@ -1,4 +1,4 @@
import { connectionMongo, getMongoModel, type Model } from '../../../common/mongo'; import { connectionMongo, getMongoModel } from '../../../common/mongo';
const { Schema, model, models } = connectionMongo; const { Schema, model, models } = connectionMongo;
import { DatasetDataSchemaType } from '@fastgpt/global/core/dataset/type.d'; import { DatasetDataSchemaType } from '@fastgpt/global/core/dataset/type.d';
import { import {
@@ -70,10 +70,7 @@ const DatasetDataSchema = new Schema({
rebuilding: Boolean, rebuilding: Boolean,
// Abandon // Abandon
fullTextToken: { fullTextToken: String,
type: String,
default: ''
},
initFullText: Boolean initFullText: Boolean
}); });
@@ -87,7 +84,7 @@ try {
updateTime: -1 updateTime: -1
}); });
// FullText tmp full text index // FullText tmp full text index
DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' }); // DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
// Recall vectors after data matching // Recall vectors after data matching
DatasetDataSchema.index({ teamId: 1, datasetId: 1, collectionId: 1, 'indexes.dataId': 1 }); DatasetDataSchema.index({ teamId: 1, datasetId: 1, collectionId: 1, 'indexes.dataId': 1 });
DatasetDataSchema.index({ updateTime: 1 }); DatasetDataSchema.index({ updateTime: 1 });

View File

@@ -1,6 +1,7 @@
import { NextAPI } from '@/service/middleware/entry'; import { NextAPI } from '@/service/middleware/entry';
import { delay } from '@fastgpt/global/common/system/utils'; import { delay } from '@fastgpt/global/common/system/utils';
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun'; import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
import { jiebaSplit } from '@fastgpt/service/common/string/jieba';
import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema'; import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema';
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema'; import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
import { authCert } from '@fastgpt/service/support/permission/auth/common'; import { authCert } from '@fastgpt/service/support/permission/auth/common';
@@ -23,16 +24,36 @@ async function handler(req: NextApiRequest, res: NextApiResponse) {
const start = Date.now(); const start = Date.now();
await initData(batchSize); await initData(batchSize);
// await restore();
console.log('Init data time:', Date.now() - start); console.log('Init data time:', Date.now() - start);
success = 0; success = 0;
// await batchUpdateFields(); batchUpdateFields();
return { success: true }; return { success: true };
} }
export default NextAPI(handler); export default NextAPI(handler);
/**
 * Backfills `fullTextToken` on dataset data documents that do not have it.
 *
 * Repeatedly fetches one document whose `fullTextToken` field does not exist,
 * sets the field to the `jiebaSplit` result of its trimmed `q` + "\n" + `a`
 * text, and saves it. Stops when no such document is left.
 *
 * Any error is logged, followed by a 500ms pause, and the same work is
 * retried; the function therefore never rejects.
 *
 * Implemented as a loop rather than self-recursion: awaiting a recursive call
 * per document keeps every earlier invocation pending until the very last one
 * finishes, so memory would grow with the number of processed documents.
 */
const restore = async () => {
  for (;;) {
    try {
      const data = await MongoDatasetData.findOne({ fullTextToken: { $exists: false } });
      // Nothing left to backfill.
      if (!data) return;

      data.fullTextToken = jiebaSplit({ text: `${data.q}\n${data.a}`.trim() });
      await data.save();

      // `success` is the module-level progress counter shared with the other init helpers.
      success++;
      console.log('Success:', success);
    } catch (error) {
      console.log(error);
      // Back off briefly before retrying.
      await delay(500);
    }
  }
};
const initData = async (batchSize: number) => { const initData = async (batchSize: number) => {
try { try {
// 找到没有初始化的数据 // 找到没有初始化的数据
@@ -59,46 +80,35 @@ const initData = async (batchSize: number) => {
})), })),
{ ordered: false, session, lean: true } { ordered: false, session, lean: true }
); );
// FullText tmp 把成功插入的新数据的 dataId 更新为已初始化 // FullText tmp 把成功插入的新数据的 dataId 更新为已初始化
// await MongoDatasetData.updateMany( await MongoDatasetData.updateMany(
// { _id: { $in: result.map((item) => item.dataId) } }, { _id: { $in: result.map((item) => item.dataId) } },
// { $set: { initFullText: true }, $unset: { fullTextToken: 1 } }, { $set: { initFullText: true } },
// { session } { session }
// ); );
success += result.length; success += result.length;
console.log('Success:', success); console.log('Success:', success);
}); });
await initData(batchSize); await initData(batchSize);
} catch (error) { } catch (error: any) {
console.log(error, '---'); console.log(error, '===');
await delay(500); await delay(500);
await initData(batchSize); await initData(batchSize);
} }
}; };
// const batchUpdateFields = async (batchSize = 2000) => { const batchUpdateFields = async (batchSize = 2000) => {
// // Find documents that still have these fields // Update in batches
// const documents = await MongoDatasetData.find({ initFullText: { $exists: true } }, '_id') await MongoDatasetData.updateMany(
// .limit(batchSize) { initFullText: { $exists: true } },
// .lean(); {
$unset: {
// if (documents.length === 0) return; initFullText: 1,
fullTextToken: 1
// // Update in batches }
// await MongoDatasetData.updateMany( }
// { _id: { $in: documents.map((doc) => doc._id) } }, );
// { };
// $unset: {
// initFullText: 1
// // fullTextToken: 1
// }
// }
// );
// success += documents.length;
// console.log('Delete success:', success);
// await batchUpdateFields(batchSize);
// };

View File

@@ -90,7 +90,7 @@ export async function insertData2Dataset({
q, q,
a, a,
// FullText tmp // FullText tmp
fullTextToken: jiebaSplit({ text: qaStr }), // fullTextToken: jiebaSplit({ text: qaStr }),
chunkIndex, chunkIndex,
indexes: indexes?.map((item, i) => ({ indexes: indexes?.map((item, i) => ({
...item, ...item,
@@ -243,7 +243,7 @@ export async function updateData2Dataset({
mongoData.q = q || mongoData.q; mongoData.q = q || mongoData.q;
mongoData.a = a ?? mongoData.a; mongoData.a = a ?? mongoData.a;
// FullText tmp // FullText tmp
mongoData.fullTextToken = jiebaSplit({ text: `${mongoData.q}\n${mongoData.a}`.trim() }); // mongoData.fullTextToken = jiebaSplit({ text: `${mongoData.q}\n${mongoData.a}`.trim() });
// @ts-ignore // @ts-ignore
mongoData.indexes = newIndexes; mongoData.indexes = newIndexes;
await mongoData.save({ session }); await mongoData.save({ session });