feat: add ocr function

This commit is contained in:
Methapon Metanipat 2024-10-02 13:55:52 +07:00
parent adc3f8e068
commit 263d575955
4 changed files with 244 additions and 8 deletions

135
src/utils/mrz.ts Normal file
View file

@ -0,0 +1,135 @@
/**
 * A machine-readable zone as handed to the parsers in this file: the layout
 * identifier plus the raw text lines that make up the zone.
 */
type MRZ = {
// Layout identifier; selects which set of per-line patterns is applied.
type: 'TD1' | 'TD2' | 'TD3';
// Raw MRZ lines in top-to-bottom order; line i is matched against pattern i.
zone: string[];
};
/**
 * Per-line patterns for a TD1 zone. Entry `i` is applied to line `i` of the
 * zone; the named groups become keys of the parsed result.
 *
 * Each line's field widths add up to 30 characters. The patterns are not
 * anchored, so a longer input matches on its first fitting 30-character run.
 */
const MRZ_TD_1 = [
  // Line 1: document header and document number.
  [
    '(?<doc_type>[0-9A-Z<]{1})',
    '(?<doc_subtype>[A-Z<]{1})',
    '(?<country>[0-9A-Z<]{3})',
    '(?<doc_number>[0-9A-Z<]{9})',
    '(?<doc_number_check>[0-9A-Z<]{1})',
    '(?<complement>[0-9A-Z<]{15})',
  ],
  // Line 2: dates, sex, nationality and the trailing line check character.
  [
    '(?<birth_date>[0-9A-Z<]{6})',
    '(?<birth_date_check>[0-9A-Z<]{1})',
    '(?<sex>[mfMF<]{1})',
    '(?<expire_date>[0-9A-Z<]{6})',
    '(?<expire_date_check>[0-9A-Z<]{1})',
    '(?<nationality>[0-9A-Z<]{3})',
    '(?<optional_data>[A-Z0-9<]{11})',
    '(?<linecheck>[0-9A-Z<]{1})',
  ],
  // Line 3: the holder's name, still `<`-padded at this stage.
  ['(?<full_name>[A-Z<]{30})'],
].map((fields) => new RegExp(fields.join('')));
/**
 * Per-line patterns for a TD2 zone. Entry `i` is applied to line `i` of the
 * zone; the named groups become keys of the parsed result.
 *
 * Each line's field widths add up to 36 characters. The patterns are not
 * anchored, so a longer input matches on its first fitting 36-character run.
 *
 * Group names follow the TD1 and TD3 patterns (`nationality`,
 * `doc_number_check`) so consumers can read the same keys regardless of the
 * document layout.
 * NOTE(review): TD1/TD3 name the final check character `linecheck` while this
 * layout uses `line_check`; left unchanged here.
 */
const MRZ_TD_2 = [
  // Line 1: document header and the holder's name.
  new RegExp(
    [
      '(?<doc_type>[0-9A-Z<]{1})',
      '(?<doc_subtype>[A-Z<]{1})',
      '(?<country>[0-9A-Z<]{3})',
      '(?<full_name>[A-Z<]{31})',
    ].join(''),
  ),
  // Line 2: document number, nationality, dates, sex and optional data.
  new RegExp(
    [
      '(?<doc_number>[0-9A-Z<]{9})',
      '(?<doc_number_check>[0-9A-Z<]{1})',
      '(?<nationality>[0-9A-Z<]{3})',
      '(?<birth_date>[0-9A-Z<]{6})',
      '(?<birth_date_check>[0-9A-Z<]{1})',
      // `<` is the filler used when sex is unspecified; without it the whole
      // line fails to match for such documents.
      '(?<sex>[mfMF<]{1})',
      '(?<expire_date>[0-9A-Z<]{6})',
      '(?<expire_date_check>[0-9A-Z<]{1})',
      '(?<optional_data>[A-Z0-9<]{7})',
      '(?<line_check>[0-9A-Z<]{1})',
    ].join(''),
  ),
];
/**
 * Per-line patterns for a TD3 zone. Entry `i` is applied to line `i` of the
 * zone; the named groups become keys of the parsed result.
 *
 * Each line's field widths add up to 44 characters. The patterns are not
 * anchored, so a longer input matches on its first fitting 44-character run.
 */
const MRZ_TD_3 = [
  // Line 1: document header and the holder's name.
  new RegExp(
    [
      '(?<doc_type>[A-Z0-9<]{1})',
      '(?<doc_subtype>[A-Z0-9<]{1})',
      // Codes shorter than three characters are padded with `<` (e.g. `D<<`),
      // so the filler has to be accepted here like in the other layouts.
      '(?<country>[A-Z0-9<]{3})',
      '(?<full_name>[A-Z0-9<]{39})',
    ].join(''),
  ),
  // Line 2: document number, nationality, dates, sex and personal number.
  new RegExp(
    [
      '(?<doc_number>[0-9A-Z<]{9})',
      '(?<doc_number_check>[0-9A-Z<]{1})',
      '(?<nationality>[0-9A-Z<]{3})',
      '(?<birth_date>[0-9A-Z<]{6})',
      '(?<birth_date_check>[0-9A-Z<]{1})',
      '(?<sex>[mfMF<]{1})',
      '(?<expire_date>[0-9A-Z<]{6})',
      '(?<expire_date_check>[0-9A-Z<]{1})',
      '(?<personal_number>[A-Z0-9<]{14})',
      '(?<personal_number_check>[0-9A-Z<]{1})',
      '(?<linecheck>[0-9A-Z<]{1})',
    ].join(''),
  ),
];
/**
 * Turns raw MRZ field values into readable text, in place.
 *
 * Every `<` filler becomes a space, each run of whitespace is collapsed to a
 * single space, and leading/trailing whitespace is removed.
 *
 * @param obj Field name to raw value map; its values are overwritten.
 * @returns The same object that was passed in.
 */
function mrzCleanResult(obj: Record<string, string>) {
  for (const [key, value] of Object.entries(obj)) {
    obj[key] = value
      .replace(/</g, ' ')
      // Global flag: a value can hold several filler runs (e.g. between the
      // surname and given names and again before a later name part).
      .replace(/\s{2,}/g, ' ')
      .trim();
  }
  return obj;
}
/**
 * Parses a TD1 zone by matching line `i` against `MRZ_TD_1[i]` and merging the
 * named groups of every matching line into one record.
 *
 * Lines that do not match their pattern, and lines beyond the number of
 * available patterns, contribute nothing instead of throwing.
 *
 * @param mrz The zone to parse.
 * @returns The input zone together with the cleaned field values.
 */
export function parseType1(mrz: MRZ) {
  const result: Record<string, string> = {};
  mrz.zone.forEach((line, i) => {
    // A single exec replaces the former test + exec pair; the optional chain
    // covers zones that carry more lines than there are patterns.
    const groups = MRZ_TD_1[i]?.exec(line)?.groups;
    if (groups) Object.assign(result, groups);
  });
  return { mrz, result: mrzCleanResult(result) };
}
/**
 * Parses a TD2 zone by matching line `i` against `MRZ_TD_2[i]` and merging the
 * named groups of every matching line into one record.
 *
 * Lines that do not match their pattern, and lines beyond the number of
 * available patterns, contribute nothing instead of throwing.
 *
 * @param mrz The zone to parse.
 * @returns The input zone together with the cleaned field values.
 */
export function parseType2(mrz: MRZ) {
  const result: Record<string, string> = {};
  mrz.zone.forEach((line, i) => {
    // A single exec replaces the former test + exec pair; the optional chain
    // covers zones that carry more lines than there are patterns.
    const groups = MRZ_TD_2[i]?.exec(line)?.groups;
    if (groups) Object.assign(result, groups);
  });
  return { mrz, result: mrzCleanResult(result) };
}
/**
 * Parses a TD3 zone by matching line `i` against `MRZ_TD_3[i]` and merging the
 * named groups of every matching line into one record.
 *
 * Lines that do not match their pattern, and lines beyond the number of
 * available patterns, contribute nothing instead of throwing.
 *
 * @param mrz The zone to parse.
 * @returns The input zone together with the cleaned field values.
 */
export function parseType3(mrz: MRZ) {
  const result: Record<string, string> = {};
  mrz.zone.forEach((line, i) => {
    // A single exec replaces the former test + exec pair; the optional chain
    // covers zones that carry more lines than there are patterns.
    const groups = MRZ_TD_3[i]?.exec(line)?.groups;
    if (groups) Object.assign(result, groups);
  });
  return { mrz, result: mrzCleanResult(result) };
}
/**
 * Routes a zone to the parser that matches its declared layout.
 *
 * @param mrz The zone to parse.
 * @returns The selected parser's output, or `null` when `mrz.type` is none of
 *   the known layouts.
 */
export function parseMRZ(mrz: MRZ) {
  switch (mrz.type) {
    case 'TD1':
      return parseType1(mrz);
    case 'TD2':
      return parseType2(mrz);
    case 'TD3':
      return parseType3(mrz);
    default:
      // Only reachable when the value bypassed the type checker.
      return null;
  }
}

38
src/utils/ocr.ts Normal file
View file

@ -0,0 +1,38 @@
import { createWorker, ImageLike, RecognizeResult } from 'tesseract.js';
import { parseMRZ } from './mrz';
/** Shared worker, assigned once its creation has succeeded. */
let worker: Tesseract.Worker;
/**
 * Worker creation currently in flight. Sharing it keeps calls that arrive
 * before the first creation finishes from each spawning their own worker.
 */
let workerInit: Promise<Tesseract.Worker> | undefined;

/**
 * Lazily creates the shared worker and, when a callback is supplied, runs
 * recognition on `image` and passes the result to it.
 *
 * Without a callback the image is not recognised; the call only makes sure the
 * worker exists.
 *
 * @param image The image handed to the worker's `recognize`.
 * @param callback Receives the recognition result; its return value becomes
 *   the resolved value of the returned promise.
 * @returns A promise of the callback's return value, or of `undefined` when no
 *   callback was given. Rejects if worker creation or recognition rejects; a
 *   failed creation is retried on the next call.
 */
export function runOcr(image: ImageLike): Promise<void>;
export function runOcr<T extends (result: RecognizeResult) => void>(
  image: ImageLike,
  callback: T,
): Promise<ReturnType<T>>;
export async function runOcr<T extends (result: RecognizeResult) => void>(
  image: ImageLike,
  callback?: T,
) {
  if (!worker) {
    if (!workerInit) {
      workerInit = createWorker(['ocrb', 'eng', 'khm'], 1, {
        langPath: '/ocr-data',
      });
    }
    try {
      worker = await workerInit;
    } finally {
      // Drop the settled promise so a rejected creation does not stick.
      workerInit = undefined;
    }
  }
  if (callback) return callback(await worker.recognize(image));
}
/**
 * Picks the MRZ lines out of recognised text and parses them.
 *
 * The text is split on whitespace and every token containing a run of at least
 * 30 MRZ characters (`A-Z`, `0-9`, `<`) is kept as a zone line. The layout is
 * then chosen from the number of lines and the length of the first one:
 * 3 lines of 30 is TD1, 2 lines of 36 is TD2, 2 lines of 44 is TD3.
 *
 * @param result Recognition output whose `data.text` is scanned.
 * @returns The parsed zone, or `null` when the lines fit none of the layouts.
 */
export function parseResultMRZ(result: RecognizeResult) {
  const zone: string[] = [];
  for (const token of result.data.text.split(/[\s\r\n]+/)) {
    if (/[A-Z0-9<]{30,}/.test(token)) zone.push(token);
  }

  const lineCount = zone.length;
  if (lineCount !== 2 && lineCount !== 3) return null;

  // Only the first line's length takes part in the layout decision.
  const width = zone[0].length;
  if (lineCount === 3) {
    return width === 30 ? parseMRZ({ type: 'TD1', zone }) : null;
  }
  switch (width) {
    case 36:
      return parseMRZ({ type: 'TD2', zone });
    case 44:
      return parseMRZ({ type: 'TD3', zone });
    default:
      return null;
  }
}