(ns textract.core
(:gen-class)
(:require [clojure.data.json :as json]
[clojure.string :as string]
[clojure.java.io :as io]
[cognitect.aws.client.api :as aws]
[clj-time.core :as tm]
[clj-time.format :as tf]))
(def ^:dynamic *bucket*
  "Name of the S3 bucket used by the upload, OCR and delete requests in this
  namespace. The default value is a placeholder."
  "YOUR-BUCKET-NAME")

(def key-format
  "Formatter for the `yyyyMMdd-HHmmss` timestamp that prefixes generated S3
  keys, set to the default time zone."
  (-> "yyyyMMdd-HHmmss"
      (tf/formatter (tm/default-time-zone))))

(def uniq-num
  "Counter held in an atom; it is incremented and appended to each generated
  S3 key."
  (atom 1))
(defn base-line
  "Returns the reference position of `block`: the `:Top` of its
  `[:Geometry :BoundingBox]` plus half of its `:Height`, i.e. the vertical
  center of the bounding box."
  [block]
  (let [bbox        (get-in block [:Geometry :BoundingBox])
        half-height (/ (:Height bbox) 2)]
    (+ (:Top bbox) half-height)))
(defn extract-line-blocks
  "Extracts from the parsed JSON `content` only the entries of `:Blocks`
  whose `:BlockType` is \"LINE\", sorted in ascending order of `base-line`."
  [content]
  (let [line-block? (fn [block] (= "LINE" (:BlockType block)))
        line-blocks (filter line-block? (:Blocks content))]
    (sort-by base-line line-blocks)))
(defn get-min-dist
  "Returns the smallest y-axis distance between consecutive blocks, measured
  as the difference of each block's `base-line` from the previous one in the
  given order.

  When `blocks` holds exactly one block there is no pair to compare, and that
  block's own base line is returned instead. Not meaningful for an empty
  collection, because `min` would be applied to no arguments."
  [blocks]
  (let [ys   (map base-line blocks)
        ;; Difference between each base line and the one before it.
        gaps (map (fn [[upper lower]] (- lower upper))
                  (partition 2 1 ys))]
    (if (not= 1 (count ys))
      (apply min gaps)
      (first ys))))
(defn make-near?
  "Returns a two-argument predicate that decides whether two blocks are close
  enough to each other.

  The threshold is 1.5 times `(get-min-dist blocks)`. The predicate is true
  when the absolute difference of the two blocks' base lines is strictly below
  that threshold. If either argument is missing (falsey), it returns true."
  [blocks]
  (let [threshold (* 1.5 (get-min-dist blocks))
        gap       (fn [a b] (Math/abs (- (base-line a) (base-line b))))]
    (fn [a b]
      (or (not a)
          (not b)
          (< (gap a b) threshold)))))
(defn paragraphize
  "Groups line blocks into paragraphs.

  Walks `blocks` in order; a block joins the current paragraph when the
  predicate from `(make-near? blocks)` says it is near the previous block,
  and otherwise starts a new paragraph. Returns a vector of paragraphs, each
  a vector of blocks; an empty input yields `[]`."
  [blocks]
  (if-not (seq blocks)
    []
    (let [near?  (make-near? blocks)
          ;; Either extend the last paragraph or open a new one.
          append (fn [paragraphs block]
                   (let [current (peek paragraphs)]
                     (if (near? (peek current) block)
                       (conj (pop paragraphs) (conj current block))
                       (conj paragraphs [block]))))]
      (reduce append
              [[(first blocks)]]
              (rest blocks)))))
(defn textize
  "Converts paragraphs into a single text string.

  The `:Text` values of the blocks in a paragraph are joined with one newline,
  and the paragraphs are joined with a blank line (two newlines)."
  [paragraphs]
  (let [paragraph->text (fn [blocks]
                          (string/join "\n" (map :Text blocks)))]
    (string/join "\n\n" (map paragraph->text paragraphs))))
(defn textract
  "Runs OCR through the AWS API on a document image stored in S3.

  Invokes the `:DetectDocumentText` operation on `client` for the object named
  `s3-key` in `*bucket*` and returns the result of `aws/invoke`."
  [client s3-key]
  (let [s3-object {:Bucket *bucket*
                   :Name   s3-key}
        request   {:Document {:S3Object s3-object}}]
    (aws/invoke client {:op      :DetectDocumentText
                        :request request})))
(defn s3-put
  "Uploads a local file to S3.

  Invokes the `:PutObject` operation on `client`, storing the contents of the
  file at `file-path` under `s3-key` in `*bucket*`, and returns the result of
  `aws/invoke`.

  The file is opened with `with-open` so that the input stream is closed when
  the call completes or throws, instead of leaking the file handle."
  [client file-path s3-key]
  (with-open [body (java.io.FileInputStream. file-path)]
    (aws/invoke
     client
     {:op :PutObject
      :request {:Bucket *bucket*
                :Key s3-key
                :Body body}})))
(defn s3-rm
  "Deletes a file from S3.

  Invokes the `:DeleteObject` operation on `client` for `s3-key` in `*bucket*`
  and returns the result of `aws/invoke`."
  [client s3-key]
  (let [request {:Bucket *bucket*
                 :Key    s3-key}]
    (aws/invoke client {:op      :DeleteObject
                        :request request})))
(defn -main
  "Command-line entry point.

  - No arguments: prints the usage text.
  - `input-png output-json output-text`: uploads the image to S3 under a
    generated key, runs `textract` on it, writes the raw result as JSON to
    `output-json` and the paragraphized text to `output-text`, and prints the
    text. The temporary S3 object is deleted as soon as the OCR call finishes,
    even when that call throws.
  - `input-json output-text`: reads a previously saved JSON result, rebuilds
    the text, writes it to `output-text` and prints it.
  - Any other number of arguments: prints the usage text."
  ([]
   (println "Usage: textract <INput-png> <OUTput-json> <OUTput-text>")
   (println " <INput-json> <OUTput-text>"))
  ([_]
   ;; A single argument matches no mode; show usage instead of an arity error.
   (-main))
  ([input-json output-text]
   (let [text (-> input-json
                  slurp
                  (json/read-str :key-fn keyword)
                  extract-line-blocks
                  paragraphize
                  textize)]
     (io/make-parents output-text)
     (spit output-text text)
     (println text)))
  ([input-png output-json output-text]
   (let [s3-key (str (tf/unparse key-format (tm/now))
                     "-"
                     (swap! uniq-num inc)
                     ".png")
         s3-client (aws/client {:api :s3 :region "ap-southeast-1"})
         tx-client (aws/client {:api :textract :region "ap-southeast-1"})]
     (s3-put s3-client input-png s3-key)
     (let [;; The uploaded object is only needed for the OCR call, so remove
           ;; it in `finally` to avoid leaving it behind on failure.
           result (try
                    (textract tx-client s3-key)
                    (finally
                      (s3-rm s3-client s3-key)))
           text (-> result extract-line-blocks paragraphize textize)]
       ;; `run!` performs the side effect eagerly without building a seq.
       (run! io/make-parents [output-text output-json])
       (spit output-json (json/write-str result))
       (spit output-text text)
       (println text))))
  ([_ _ _ _ & _]
   ;; Too many arguments; show usage instead of an arity error.
   (-main)))
; dependencies [[org.clojure/clojure "1.10.1"]
; [org.clojure/data.json "0.2.7"]
; [com.cognitect.aws/api "0.8.391"]
; [com.cognitect.aws/endpoints "1.1.11.670"]
; [com.cognitect.aws/s3 "770.2.568.0"]
; [com.cognitect.aws/textract "747.2.533.0"]
; [clj-time "0.15.2"]]