feat: use rusty-tesseract as another backend
Also use OnceLock & LazyLock
This commit is contained in:
@ -1,3 +1,4 @@
|
||||
#![feature(lazy_cell)]
|
||||
pub use log;
|
||||
pub use tracing::{debug, error, info, trace, warn};
|
||||
use tracing_subscriber::{self, fmt, EnvFilter};
|
||||
@ -10,6 +11,7 @@ pub fn setup_logger(level: &str) -> Result<(), fern::InitError> {
|
||||
.with_level(true)
|
||||
.with_target(true)
|
||||
.with_thread_ids(false)
|
||||
.with_line_number(true)
|
||||
.with_thread_names(false);
|
||||
let filter = EnvFilter::builder()
|
||||
.from_env()
|
||||
|
||||
@ -1,18 +0,0 @@
|
||||
pub use leptess::{LepTess, Variable};
|
||||
|
||||
pub fn init_tesseract(numeric_only: bool) -> Result<LepTess, String> {
|
||||
let mut lep_tess = match LepTess::new(None, "eng") {
|
||||
Ok(lep_tess) => lep_tess,
|
||||
Err(why) => return Err(format!("Failed to initialize Tesseract: {:?}", why)),
|
||||
};
|
||||
lep_tess.set_variable(Variable::TesseditPagesegMode, "6").unwrap();
|
||||
// Use LSTM only.
|
||||
lep_tess.set_variable(Variable::TesseditOcrEngineMode, "1").unwrap();
|
||||
if numeric_only {
|
||||
match lep_tess.set_variable(Variable::TesseditCharWhitelist, "0123456789") {
|
||||
Ok(_) => (),
|
||||
Err(why) => return Err(format!("Failed to set whitelist: {:?}", why)),
|
||||
};
|
||||
}
|
||||
Ok(lep_tess)
|
||||
}
|
||||
60
swordfish-common/src/tesseract/libtesseract.rs
Normal file
60
swordfish-common/src/tesseract/libtesseract.rs
Normal file
@ -0,0 +1,60 @@
|
||||
pub use leptess::{LepTess, Variable};
|
||||
use std::{sync::{
|
||||
Arc, Mutex, LazyLock
|
||||
}, thread};
|
||||
|
||||
static TESSERACT: LazyLock<Arc<Mutex<LepTess>>> = LazyLock::new(|| {
|
||||
let mut lep_tess = match LepTess::new(None, "eng") {
|
||||
Ok(lep_tess) => lep_tess,
|
||||
Err(why) => panic!("{}", format!("Failed to initialize Tesseract: {:?}", why)),
|
||||
};
|
||||
// lep_tess.set_variable(Variable::TesseditPagesegMode, "6").unwrap();
|
||||
// Use LSTM only.
|
||||
lep_tess.set_variable(Variable::TesseditOcrEngineMode, "2").unwrap();
|
||||
Arc::new(Mutex::new(lep_tess))
|
||||
});
|
||||
|
||||
static mut TESSERACT_VEC: Vec<Arc<Mutex<LepTess>>> = Vec::new();
|
||||
|
||||
pub fn get_tesseract(numeric_only: bool) -> Arc<Mutex<LepTess>> {
|
||||
TESSERACT.clone()
|
||||
}
|
||||
|
||||
pub unsafe fn get_tesseract_from_vec(numeric_only: bool) -> Arc<Mutex<LepTess>> {
|
||||
let lep_tess: Arc<Mutex<LepTess>>;
|
||||
if TESSERACT_VEC.len() == 0 {
|
||||
for _ in 0..3 {
|
||||
let num_only = numeric_only.clone();
|
||||
thread::spawn(move || {
|
||||
let ocr = init_tesseract(num_only).unwrap();
|
||||
TESSERACT_VEC.push(Arc::new(Mutex::new(ocr)));
|
||||
});
|
||||
}
|
||||
lep_tess = Arc::new(Mutex::new(init_tesseract(numeric_only).unwrap()));
|
||||
}
|
||||
else {
|
||||
lep_tess = TESSERACT_VEC.pop().unwrap();
|
||||
thread::spawn(move || unsafe {
|
||||
let ocr = init_tesseract(numeric_only).unwrap();
|
||||
TESSERACT_VEC.push(Arc::new(Mutex::new(ocr)));
|
||||
});
|
||||
}
|
||||
lep_tess
|
||||
}
|
||||
|
||||
pub fn init_tesseract(numeric_only: bool) -> Result<LepTess, String> {
|
||||
let mut lep_tess = match LepTess::new(None, "eng") {
|
||||
Ok(lep_tess) => lep_tess,
|
||||
Err(why) => return Err(format!("Failed to initialize Tesseract: {:?}", why)),
|
||||
};
|
||||
lep_tess.set_variable(Variable::TesseditPagesegMode, "6").unwrap();
|
||||
// Use LSTM only.
|
||||
lep_tess.set_variable(Variable::TesseditOcrEngineMode, "1").unwrap();
|
||||
if numeric_only {
|
||||
match lep_tess.set_variable(Variable::TesseditCharWhitelist, "0123456789") {
|
||||
Ok(_) => (),
|
||||
Err(why) => return Err(format!("Failed to set whitelist: {:?}", why)),
|
||||
};
|
||||
}
|
||||
Ok(lep_tess)
|
||||
}
|
||||
2
swordfish-common/src/tesseract/mod.rs
Normal file
2
swordfish-common/src/tesseract/mod.rs
Normal file
@ -0,0 +1,2 @@
|
||||
pub mod subprocess;
|
||||
pub mod libtesseract;
|
||||
36
swordfish-common/src/tesseract/subprocess.rs
Normal file
36
swordfish-common/src/tesseract/subprocess.rs
Normal file
@ -0,0 +1,36 @@
|
||||
pub use rusty_tesseract;
|
||||
pub use rusty_tesseract::{Args, Image};
|
||||
use std::{collections::HashMap, sync::LazyLock};
|
||||
|
||||
static TESSERACT_ARGS: LazyLock<Args> = LazyLock::new(|| Args {
|
||||
lang: "eng".to_string(),
|
||||
config_variables: HashMap::new(),
|
||||
psm: Some(6),
|
||||
dpi: None,
|
||||
oem: Some(1),
|
||||
});
|
||||
|
||||
static TESSERACT_NUMERIC_ARGS: LazyLock<Args> = LazyLock::new(|| Args {
|
||||
lang: "eng".to_string(),
|
||||
config_variables: HashMap::from([(
|
||||
"tessedit_char_whitelist".into(),
|
||||
"0123456789".into(),
|
||||
)]),
|
||||
psm: Some(6),
|
||||
dpi: None,
|
||||
oem: Some(1),
|
||||
});
|
||||
|
||||
pub fn image_to_string(image: &Image) -> Result<String, String> {
|
||||
match rusty_tesseract::image_to_string(image, &TESSERACT_ARGS) {
|
||||
Ok(text) => Ok(text),
|
||||
Err(why) => Err(format!("Failed to OCR image: {:?}", why)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn image_to_numeric_string(image: &Image) -> Result<String, String> {
|
||||
match rusty_tesseract::image_to_string(image, &TESSERACT_NUMERIC_ARGS) {
|
||||
Ok(text) => Ok(text),
|
||||
Err(why) => Err(format!("Failed to OCR image: {:?}", why)),
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user