intelligent-document-processor/app.py at main · Zsaqr/intelligent-document-processor · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# app.py
import html
import io
from typing import List, Dict

import streamlit as st
from PIL import Image
import pandas as pd

from ocr_utils import ocr_image
from ner_utils import extract_entities


# ============= Helper functions ============= #

def build_highlighted_html(text: str, entities: List[Dict]) -> str:
    """
    Render ``text`` as an HTML paragraph with entity spans highlighted.

    Args:
        text: The raw text the entities refer to.
        entities: Dicts with ``start_char`` / ``end_char`` (character offsets
            into ``text``) and ``label``. The list is not modified.

    Returns:
        A ``<p>...</p>`` string. Every piece of ``text`` and every label is
        HTML-escaped, so markup characters in the input are shown literally
        instead of being interpreted by the browser.

    Entities that overlap an already highlighted span, or whose span is
    empty, are skipped so no part of the text is emitted twice.
    """

    if not text:
        return "<p><em>No text to display.</em></p>"

    if not entities:
        # Nothing to highlight: return the (escaped) plain text.
        return f"<p>{html.escape(text, quote=False)}</p>"

    # Simple color palette per label (you can expand this)
    label_colors = {
        "PERSON": "#ffeeba",
        "ORG": "#bee5eb",
        "GPE": "#c3e6cb",
        "LOC": "#d6d8db",
        "DATE": "#f5c6cb",
        "TIME": "#faf2cc",
        "MONEY": "#f1c40f33",
        "DEFAULT": "#e2e3e5",
    }

    # Order by start offset; on ties the longer span comes first so an
    # enclosing entity wins over one nested at the same position.
    ordered = sorted(
        entities,
        key=lambda e: (e["start_char"], -e["end_char"]),
    )

    text_len = len(text)
    pieces = []
    cursor = 0  # index of the first character not yet emitted

    for ent in ordered:
        start = ent["start_char"]
        end = min(ent["end_char"], text_len)

        # Skip spans that overlap what was already emitted (this also covers
        # negative starts) and spans that are empty or start past the end.
        if start < cursor or end <= start:
            continue

        label = ent["label"]
        color = label_colors.get(label, label_colors["DEFAULT"])

        # Text between the previous entity and this one.
        pieces.append(html.escape(text[cursor:start], quote=False))

        # The highlighted entity itself, followed by its label tag.
        span_text = html.escape(text[start:end], quote=False)
        safe_label = html.escape(str(label), quote=False)
        pieces.append(
            f'<span style="background-color:{color};'
            f' padding:2px 4px; border-radius:4px; '
            f'border:1px solid #999; margin:1px;">'
            f'{span_text} '
            f'<span style="font-size:0.7em; color:#555;">[{safe_label}]</span>'
            f'</span>'
        )
        cursor = end

    # Remaining text after the last entity.
    pieces.append(html.escape(text[cursor:], quote=False))

    body = "".join(pieces)
    return f"<p>{body}</p>"


def run_ocr_flow():
    """
    Render the OCR upload section and return the text extracted by OCR.

    Shows a file uploader and a Tesseract language selector. Once an image is
    uploaded it is displayed together with a "Run OCR" button that passes the
    image to ``ocr_image``.

    The OCR result is kept in ``st.session_state`` so that it survives the
    script reruns Streamlit performs on every widget interaction; without
    this, pressing any other button would discard the extracted text.

    Returns:
        The OCR text for the current upload and language, or ``""`` when no
        image is uploaded or OCR has not been run for it yet.
    """
    st.subheader("1️⃣ Upload an image for OCR")

    uploaded_file = st.file_uploader(
        "Upload a scanned document / image",
        type=["png", "jpg", "jpeg"],
    )

    ocr_lang = st.selectbox(
        "OCR language (Tesseract)",
        ["eng", "ara", "eng+ara"],  # e.g. add "ara" if the Arabic language pack is installed
        index=0,
    )

    if uploaded_file is None:
        return ""

    # getvalue() returns the whole buffer regardless of the current stream
    # position, so it stays correct even if the file was read before.
    image_bytes = uploaded_file.getvalue()
    pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

    st.image(pil_image, caption="Uploaded Image", use_column_width=True)

    state_key = "_ocr_flow_result"
    # Identifies which upload/language a stored result belongs to, so a stale
    # result is never shown for a different image or language.
    request_id = (uploaded_file.name, len(image_bytes), ocr_lang)

    if st.button("Run OCR"):
        with st.spinner("Running OCR..."):
            extracted_text = ocr_image(pil_image, lang=ocr_lang)

        st.session_state[state_key] = {
            "request_id": request_id,
            "text": extracted_text,
        }
        st.success("OCR completed!")
    else:
        # The button is only True on the rerun triggered by the click; on
        # later reruns fall back to the stored result for this same request.
        stored = st.session_state[state_key] if state_key in st.session_state else None
        if not stored or stored["request_id"] != request_id:
            return ""
        extracted_text = stored["text"]

    st.text_area("Extracted Text", value=extracted_text, height=200)

    return extracted_text


def run_ner_flow(input_text: str = ""):
    """
    Render the NER section: a text box, a "Run NER" button, and the results.

    ``input_text`` pre-fills the text box. When the button is pressed and
    entities are found, they are shown as a table followed by the text with
    the entities highlighted. Empty input and an empty entity list are
    reported with a warning / info message instead.
    """
    st.subheader("2️⃣ Named Entity Recognition (NER)")

    text = st.text_area(
        "Enter or paste text here (or use the OCR output above 👆)",
        value=input_text,
        height=200,
    )

    # Nothing else to do until the user asks for NER.
    if not st.button("Run NER"):
        return

    if not text.strip():
        st.warning("Please enter some text first.")
        return

    with st.spinner("Running NER..."):
        found = extract_entities(text, model_name="en_core_web_sm")

    if not found:
        st.info("No entities found in the text.")
        return

    # Entity table
    entity_table = pd.DataFrame(found)
    st.markdown("### Extracted Entities")
    st.dataframe(entity_table)

    # Text with the entities highlighted in place
    st.markdown("### Highlighted Text")
    st.markdown(build_highlighted_html(text, found), unsafe_allow_html=True)


# ============= Streamlit App Layout ============= #

def main():
    """
    Lay out the Streamlit page and dispatch to the flow chosen in the sidebar.

    The sidebar offers three modes: the combined OCR → NER pipeline, OCR on
    its own, and NER on its own.
    """
    st.set_page_config(
        page_title="Intelligent Document Processor (OCR + NER)",
        layout="wide",
    )

    st.title("📄 Intelligent Document Processor")
    intro = (
        "Combined **Optical Character Recognition (OCR)** and "
        "**Named Entity Recognition (NER)** pipeline."
    )
    st.write(intro)

    mode_options = ["OCR → NER Pipeline", "OCR Only", "NER Only"]
    selected = st.sidebar.radio("Mode", mode_options)

    if selected == "OCR Only":
        st.header("OCR Only")
        run_ocr_flow()  # returned text is not needed in this mode
        return

    if selected == "NER Only":
        st.header("NER Only")
        run_ner_flow()
        return

    # Any other selection is the combined pipeline.
    st.header("End-to-End: OCR → NER Pipeline")
    steps = "\n".join(
        [
            "1. Upload an image (scanned document / handwritten note).",
            "2. Run OCR to extract text.",
            "3. Run NER on the extracted text.",
        ]
    )
    st.write(steps)

    ocr_text = run_ocr_flow()
    st.markdown("---")
    run_ner_flow(input_text=ocr_text)


# Build the page only when this file is executed as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()