monorepo packages (#344)
This commit is contained in:
56
projects/app/src/utils/file.ts
Normal file
56
projects/app/src/utils/file.ts
Normal file
@@ -0,0 +1,56 @@
|
||||
import { getErrText } from './tools';
|
||||
import { countPromptTokens } from './common/tiktoken';
|
||||
|
||||
/**
|
||||
* text split into chunks
|
||||
* maxLen - one chunk len. max: 3500
|
||||
* overlapLen - The size of the before and after Text
|
||||
* maxLen > overlapLen
|
||||
*/
|
||||
export const splitText2Chunks = ({ text, maxLen }: { text: string; maxLen: number }) => {
|
||||
const overlapLen = Math.floor(maxLen * 0.25); // Overlap length
|
||||
|
||||
try {
|
||||
const splitTexts = text.split(/(?<=[。!?;.!?;])/g);
|
||||
const chunks: string[] = [];
|
||||
|
||||
let preChunk = '';
|
||||
let chunk = '';
|
||||
for (let i = 0; i < splitTexts.length; i++) {
|
||||
const text = splitTexts[i];
|
||||
chunk += text;
|
||||
if (chunk.length > maxLen - overlapLen) {
|
||||
preChunk += text;
|
||||
}
|
||||
if (chunk.length >= maxLen) {
|
||||
chunks.push(chunk);
|
||||
chunk = preChunk;
|
||||
preChunk = '';
|
||||
}
|
||||
}
|
||||
|
||||
if (chunk) {
|
||||
chunks.push(chunk);
|
||||
}
|
||||
|
||||
const tokens = chunks.reduce((sum, chunk) => sum + countPromptTokens(chunk, 'system'), 0);
|
||||
|
||||
return {
|
||||
chunks,
|
||||
tokens
|
||||
};
|
||||
} catch (err) {
|
||||
throw new Error(getErrText(err));
|
||||
}
|
||||
};
|
||||
|
||||
/* simple text, remove chinese space and extra \n */
|
||||
export const simpleText = (text: string) => {
|
||||
text = text.replace(/([\u4e00-\u9fa5])\s+([\u4e00-\u9fa5])/g, '$1$2');
|
||||
text = text.replace(/\n{2,}/g, '\n');
|
||||
text = text.replace(/\s{2,}/g, ' ');
|
||||
|
||||
text = text.replace(/[\x00-\x08]/g, ' ');
|
||||
|
||||
return text;
|
||||
};
|
||||
Reference in New Issue
Block a user