full text tmp code (#3561)

* full text tmp code

* fix: init

* fix: init

* remove tmp code

* remove tmp code

* 4818-alpha
This commit is contained in:
Archer
2025-01-10 18:03:14 +08:00
committed by GitHub
parent fadb3e3ceb
commit 47f7b1a7a3
4 changed files with 48 additions and 46 deletions

View File

@@ -33,12 +33,7 @@ const DatasetDataTextSchema = new Schema({
});
try {
DatasetDataTextSchema.index(
{ teamId: 1, datasetId: 1, fullTextToken: 'text' },
{
partialFilterExpression: { fullTextToken: { $exists: true } }
}
);
DatasetDataTextSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
DatasetDataTextSchema.index({ dataId: 1 }, { unique: true });
} catch (error) {
console.log(error);

View File

@@ -1,4 +1,4 @@
import { connectionMongo, getMongoModel, type Model } from '../../../common/mongo';
import { connectionMongo, getMongoModel } from '../../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { DatasetDataSchemaType } from '@fastgpt/global/core/dataset/type.d';
import {
@@ -70,10 +70,7 @@ const DatasetDataSchema = new Schema({
rebuilding: Boolean,
// Abandon
fullTextToken: {
type: String,
default: ''
},
fullTextToken: String,
initFullText: Boolean
});
@@ -87,7 +84,7 @@ try {
updateTime: -1
});
// FullText tmp full text index
DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
// DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
// Recall vectors after data matching
DatasetDataSchema.index({ teamId: 1, datasetId: 1, collectionId: 1, 'indexes.dataId': 1 });
DatasetDataSchema.index({ updateTime: 1 });

View File

@@ -1,6 +1,7 @@
import { NextAPI } from '@/service/middleware/entry';
import { delay } from '@fastgpt/global/common/system/utils';
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
import { jiebaSplit } from '@fastgpt/service/common/string/jieba';
import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema';
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
import { authCert } from '@fastgpt/service/support/permission/auth/common';
@@ -23,16 +24,36 @@ async function handler(req: NextApiRequest, res: NextApiResponse) {
const start = Date.now();
await initData(batchSize);
// await restore();
console.log('Init data time:', Date.now() - start);
success = 0;
// await batchUpdateFields();
batchUpdateFields();
return { success: true };
}
export default NextAPI(handler);
/**
 * Back-fills `fullTextToken` on dataset data documents that lack the field.
 *
 * Each pass fetches one document where `fullTextToken` does not exist,
 * tokenizes its `q` / `a` text with `jiebaSplit`, saves the document and
 * increments the module-level `success` counter. The promise resolves once
 * no matching document remains.
 *
 * Any error raised during a pass is logged and the pass is retried after a
 * 500 ms pause; there is no retry limit.
 *
 * Written as a loop instead of self-recursion so that a large backlog (or a
 * persistently failing query) does not keep one pending async call alive per
 * processed document.
 */
const restore = async () => {
  for (;;) {
    try {
      const data = await MongoDatasetData.findOne({ fullTextToken: { $exists: false } });
      // Nothing left to back-fill: done.
      if (!data) return;

      data.fullTextToken = jiebaSplit({ text: `${data.q}\n${data.a}`.trim() });
      await data.save();

      success++;
      console.log('Success:', success);
    } catch (error) {
      console.log(error);
      // Brief pause before the next attempt.
      await delay(500);
    }
  }
};
const initData = async (batchSize: number) => {
try {
// 找到没有初始化的数据
@@ -59,46 +80,35 @@ const initData = async (batchSize: number) => {
})),
{ ordered: false, session, lean: true }
);
// FullText tmp 把成功插入的新数据的 dataId 更新为已初始化
// await MongoDatasetData.updateMany(
// { _id: { $in: result.map((item) => item.dataId) } },
// { $set: { initFullText: true }, $unset: { fullTextToken: 1 } },
// { session }
// );
await MongoDatasetData.updateMany(
{ _id: { $in: result.map((item) => item.dataId) } },
{ $set: { initFullText: true } },
{ session }
);
success += result.length;
console.log('Success:', success);
});
await initData(batchSize);
} catch (error) {
console.log(error, '---');
} catch (error: any) {
console.log(error, '===');
await delay(500);
await initData(batchSize);
}
};
// const batchUpdateFields = async (batchSize = 2000) => {
// // Find documents that still have these fields
// const documents = await MongoDatasetData.find({ initFullText: { $exists: true } }, '_id')
// .limit(batchSize)
// .lean();
// if (documents.length === 0) return;
// // Update in batches
// await MongoDatasetData.updateMany(
// { _id: { $in: documents.map((doc) => doc._id) } },
// {
// $unset: {
// initFullText: 1
// // fullTextToken: 1
// }
// }
// );
// success += documents.length;
// console.log('Delete success:', success);
// await batchUpdateFields(batchSize);
// };
/**
 * Strips the temporary full-text migration fields from dataset data documents.
 *
 * Issues a single `updateMany` that matches every document where
 * `initFullText` exists and unsets both `initFullText` and `fullTextToken`.
 * A rejection from `updateMany` propagates to the caller.
 *
 * @param batchSize Not read by the current implementation; the update is not
 *   chunked. Kept so the signature stays unchanged.
 */
const batchUpdateFields = async (batchSize = 2000) => {
  // Documents that still carry the migration marker.
  const migratedDocs = { initFullText: { $exists: true } };
  // Remove the marker together with the old token field.
  const dropTmpFields = {
    $unset: {
      initFullText: 1,
      fullTextToken: 1
    }
  };

  await MongoDatasetData.updateMany(migratedDocs, dropTmpFields);
};

View File

@@ -90,7 +90,7 @@ export async function insertData2Dataset({
q,
a,
// FullText tmp
fullTextToken: jiebaSplit({ text: qaStr }),
// fullTextToken: jiebaSplit({ text: qaStr }),
chunkIndex,
indexes: indexes?.map((item, i) => ({
...item,
@@ -243,7 +243,7 @@ export async function updateData2Dataset({
mongoData.q = q || mongoData.q;
mongoData.a = a ?? mongoData.a;
// FullText tmp
mongoData.fullTextToken = jiebaSplit({ text: `${mongoData.q}\n${mongoData.a}`.trim() });
// mongoData.fullTextToken = jiebaSplit({ text: `${mongoData.q}\n${mongoData.a}`.trim() });
// @ts-ignore
mongoData.indexes = newIndexes;
await mongoData.save({ session });