full text tmp code (#3561)

* full text tmp code

* fix: init

* fix: init

* remove tmp code

* remove tmp code

* 4818-alpha
This commit is contained in:
Archer
2025-01-10 18:03:14 +08:00
committed by GitHub
parent fadb3e3ceb
commit 47f7b1a7a3
4 changed files with 48 additions and 46 deletions

View File

@@ -1,6 +1,7 @@
import { NextAPI } from '@/service/middleware/entry';
import { delay } from '@fastgpt/global/common/system/utils';
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
import { jiebaSplit } from '@fastgpt/service/common/string/jieba';
import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema';
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
import { authCert } from '@fastgpt/service/support/permission/auth/common';
@@ -23,16 +24,36 @@ async function handler(req: NextApiRequest, res: NextApiResponse) {
const start = Date.now();
await initData(batchSize);
// await restore();
console.log('Init data time:', Date.now() - start);
success = 0;
// await batchUpdateFields();
batchUpdateFields();
return { success: true };
}
export default NextAPI(handler);
const restore = async () => {
try {
const data = await MongoDatasetData.findOne({ fullTextToken: { $exists: false } });
if (!data) return;
data.fullTextToken = jiebaSplit({ text: `${data.q}\n${data.a}`.trim() });
await data.save();
success++;
console.log('Success:', success);
await restore();
} catch (error) {
console.log(error);
await delay(500);
await restore();
}
};
const initData = async (batchSize: number) => {
try {
// 找到没有初始化的数据
@@ -59,46 +80,35 @@ const initData = async (batchSize: number) => {
})),
{ ordered: false, session, lean: true }
);
// FullText tmp 把成功插入的新数据的 dataId 更新为已初始化
// await MongoDatasetData.updateMany(
// { _id: { $in: result.map((item) => item.dataId) } },
// { $set: { initFullText: true }, $unset: { fullTextToken: 1 } },
// { session }
// );
await MongoDatasetData.updateMany(
{ _id: { $in: result.map((item) => item.dataId) } },
{ $set: { initFullText: true } },
{ session }
);
success += result.length;
console.log('Success:', success);
});
await initData(batchSize);
} catch (error) {
console.log(error, '---');
} catch (error: any) {
console.log(error, '===');
await delay(500);
await initData(batchSize);
}
};
// const batchUpdateFields = async (batchSize = 2000) => {
// // Find documents that still have these fields
// const documents = await MongoDatasetData.find({ initFullText: { $exists: true } }, '_id')
// .limit(batchSize)
// .lean();
// if (documents.length === 0) return;
// // Update in batches
// await MongoDatasetData.updateMany(
// { _id: { $in: documents.map((doc) => doc._id) } },
// {
// $unset: {
// initFullText: 1
// // fullTextToken: 1
// }
// }
// );
// success += documents.length;
// console.log('Delete success:', success);
// await batchUpdateFields(batchSize);
// };
const batchUpdateFields = async (batchSize = 2000) => {
// Update in batches
await MongoDatasetData.updateMany(
{ initFullText: { $exists: true } },
{
$unset: {
initFullText: 1,
fullTextToken: 1
}
}
);
};

View File

@@ -90,7 +90,7 @@ export async function insertData2Dataset({
q,
a,
// FullText tmp
fullTextToken: jiebaSplit({ text: qaStr }),
// fullTextToken: jiebaSplit({ text: qaStr }),
chunkIndex,
indexes: indexes?.map((item, i) => ({
...item,
@@ -243,7 +243,7 @@ export async function updateData2Dataset({
mongoData.q = q || mongoData.q;
mongoData.a = a ?? mongoData.a;
// FullText tmp
mongoData.fullTextToken = jiebaSplit({ text: `${mongoData.q}\n${mongoData.a}`.trim() });
// mongoData.fullTextToken = jiebaSplit({ text: `${mongoData.q}\n${mongoData.a}`.trim() });
// @ts-ignore
mongoData.indexes = newIndexes;
await mongoData.save({ session });