Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 90 additions & 27 deletions markup_doc/labeling_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import html
import json
import re
import requests
import unicodedata

import requests

Expand Down Expand Up @@ -737,6 +739,8 @@ def match_section(item, sections):


def match_subsection(item, sections):
if len(sections) <=2:
return None
return (
{"label": "<sub-sec>", "body": True}
if (
Expand All @@ -748,41 +752,80 @@ def match_subsection(item, sections):
)


def normalize_text(text):
    """Return *text* without markup tags, lowercased and with accents removed.

    The steps are applied in this order: anything that looks like a tag
    (``<...>``) is removed, surrounding whitespace is stripped, the text is
    lowercased, and combining marks are dropped after NFD decomposition
    (so ``"Introducción"`` becomes ``"introduccion"``).

    Args:
        text: The string to normalize. ``None`` is treated as an empty
            string so callers that read an optional field do not crash.

    Returns:
        The normalized string (``''`` when *text* is ``None`` or empty).
    """
    if text is None:
        return ''

    # Remove markup tags before any other processing.
    without_tags = re.sub(r'<[^>]+>', '', text)
    lowered = without_tags.strip().lower()

    # NFD splits accented letters into base letter + combining mark ('Mn'),
    # which lets us discard the marks and keep the base letters.
    decomposed = unicodedata.normalize('NFD', lowered)
    return ''.join(
        char for char in decomposed if unicodedata.category(char) != 'Mn'
    )


def comes_before_or_equal(obj1, obj2):
    """Tell whether the paragraph of *obj1* sorts at or before that of *obj2*.

    Each argument is a mapping whose ``'value'`` entry is itself a mapping
    holding a ``'paragraph'`` string; missing keys fall back to an empty
    paragraph. Both paragraphs go through ``normalize_text`` and are then
    compared with the ordinary string ``<=`` operator.

    Returns:
        ``True`` when the normalized paragraph of *obj1* is less than or
        equal to the normalized paragraph of *obj2*, ``False`` otherwise.
    """
    first_key, second_key = (
        normalize_text(entry.get('value', {}).get('paragraph', ''))
        for entry in (obj1, obj2)
    )
    return first_key <= second_key


def is_probable_heading(text, max_chars=100, max_words=5):
    """Guess whether *text* is short enough to be a heading.

    Args:
        text: The candidate string. ``None``, the empty string and strings
            made only of whitespace are never headings.
        max_chars: Maximum number of characters (inclusive) a heading may
            have.
        max_words: Maximum number of whitespace-separated words (inclusive)
            a heading may have.

    Returns:
        ``True`` when *text* has visible content and stays within both
        limits, ``False`` otherwise.
    """
    # Blank input has no words at all, so it must not pass as a heading.
    if not text or not text.strip():
        return False

    # Very long text is most likely a paragraph; check this first because it
    # is cheaper than splitting into words.
    if len(text) > max_chars:
        return False

    return len(text.split()) <= max_words


def create_labeled_object2(i, item, state, sections):
obj = {}
result = None

if match_section(item, sections):
result = match_section(item, sections)
state["label"] = result.get("label")
state["body"] = result.get("body")
raw_text = item.get('text', '').strip()
text = raw_text.lower()

if match_subsection(item, sections):
result = match_subsection(item, sections)
state["label"] = result.get("label")
state["body"] = result.get("body")
is_references_title = bool(re.fullmatch(
r"(?:referencias|references|referências)\s*[:.]?",
text
))

if (
state.get("body")
and re.search(r"^(refer)", item.get("text").lower())
and match_section(item, sections)
):
state["label"] = "<sec>"
state["body"] = False
state["back"] = True
obj["type"] = "paragraph"
obj["value"] = {"label": state["label"], "paragraph": item.get("text")}
is_heading_candidate = is_probable_heading(raw_text)

if state.get("body") and re.search(
r"^(refer[eê]nci|references?)\s*$", item.get("text").strip().lower()
):
state["label"] = "<sec>"
state["body"] = False
state["back"] = True
result = {"label": "<sec>", "body": False, "back": True}
obj["type"] = "paragraph"
obj["value"] = {"label": state["label"], "paragraph": item.get("text")}
# Si es título de referencias, debe poder pasar aunque falle otra regla
if is_references_title:
is_heading_candidate = True

if is_heading_candidate:
section_result = match_section(item, sections)

if section_result:
result = section_result
state['label'] = result.get('label')
state['body'] = result.get('body')

else:
subsection_result = match_subsection(item, sections)

if subsection_result:
result = subsection_result
state['label'] = result.get('label')
state['body'] = result.get('body')

if state.get('body') and is_references_title:
state['label'] = '<sec>'
state['body'] = False
state['back'] = True

obj['type'] = 'paragraph'
obj['value'] = {
'label': state['label'],
'paragraph': item.get('text')
}

if not result:
result = {"label": "<p>", "body": state["body"], "back": state["back"]}
Expand Down Expand Up @@ -1291,6 +1334,7 @@ def append_fragment(node_dest, val):
# - quitar saltos de línea
clean = re.sub(r"(?i)<br\s*/?>", "", val)
clean = clean.replace("\n", "")
clean = re.sub(r'<(?![/a-zA-Z_])', '&lt;', clean)

# normaliza entidades problemáticas
clean = clean.replace("&nbsp;", " ")
Expand Down Expand Up @@ -1385,3 +1429,22 @@ def proccess_special_content(text, data_body):
)

return res


def split_abstract_inline(text):
    """Split a paragraph that carries its abstract title inline.

    The paragraph must start (ignoring case and leading whitespace) with
    ``abstract``, ``resumen`` or ``resumo`` -- optionally wrapped in
    ``<italic>`` tags -- followed by ``:`` or ``.`` and then the abstract
    body, which may span several lines.

    Args:
        text: The paragraph text; falsy values yield ``None``.

    Returns:
        A ``(title, body)`` tuple with both parts stripped of surrounding
        whitespace, or ``None`` when the text does not follow the pattern
        or the body is empty after stripping.
    """
    if not text:
        return None

    found = re.match(
        r'(?is)^\s*(?:<italic>)?\s*(abstract|resumen|resumo)\s*(?:</italic>)?\s*[:.]\s*(.+)$',
        text,
    )
    if found is None:
        return None

    heading, body = (part.strip() for part in found.groups())

    # A body made only of whitespace does not count as an inline abstract.
    return (heading, body) if body else None
1 change: 1 addition & 0 deletions markup_doc/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,7 @@ class ArticleDocxMarkup(CommonControlField, ClusterableModel):
spsversion = models.TextField(_("Sps version"), null=True, blank=True)
artdate = models.DateField(_("Artdate"), null=True, blank=True)
ahpdate = models.DateField(_("Ahpdate"), null=True, blank=True)
dateiso = models.TextField(_("Dateiso"), null=True, blank=True)

file_xml = models.FileField(
null=True,
Expand Down
158 changes: 118 additions & 40 deletions markup_doc/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
process_reference,
process_references,
split_in_three,
create_special_content_object,
split_abstract_inline
)
from markup_doc.models import MarkupXML, ProcessStatus, UploadDocx
from markup_doc.sync_api import sync_issues_from_api, sync_journals_from_api
Expand Down Expand Up @@ -206,6 +208,9 @@ def get_labels(article_id, user_id):
next_item = None
obj_reference = []
llm_first_block = None
obj_postreference = []
last_obj = None
llama_model = False

for i, item in enumerate(content):
if next_item:
Expand All @@ -214,17 +219,37 @@ def get_labels(article_id, user_id):

obj = {}
if item.get("type") in [
"<abstract>",
"<date-accepted>",
"<date-received>",
"<kwd-group>",
]:
"<abstract>",
"<date-accepted>",
"<date-received>",
"<kwd-group>"
]:
if item.get("type") == "<abstract>":
if i + 1 < len(content):
inline_abstract = split_abstract_inline(item.get("text"))

if inline_abstract:
abstract_title, abstract_text = inline_abstract

obj["type"] = "paragraph"
obj["value"] = {
"label": "<abstract-title>",
"paragraph": item.get("text"),
"paragraph": abstract_title
}
stream_data.append(obj.copy())

obj["type"] = "paragraph_with_language"
obj["value"] = {
"label": "<abstract>",
"paragraph": abstract_text,
"language": langid.classify(abstract_text)[0] or None
}
stream_data.append(obj.copy())

elif i + 1 < len(content):
obj["type"] = "paragraph"
obj["value"] = {
"label": "<abstract-title>",
"paragraph": item.get("text")
}
stream_data.append(obj.copy())

Expand Down Expand Up @@ -369,14 +394,12 @@ def get_labels(article_id, user_id):
stream_data_body.append(obj)
continue

if item.get("text") is None or item.get("text") == "":
state["label_next"] = (
state["label_next_reset"] if state["reset"] else state["label_next"]
)
if state["back"]:
state["back"] = False
state["body"] = False
state["references"] = True
if item.get('text') is None or item.get('text') == '':
state['label_next'] = state['label_next_reset'] if state['reset'] else state['label_next']
if state['back'] and num_ref > 0:
#state['back'] = False
state['body'] = False
state['references'] = True
else:
obj, result, state = create_labeled_object2(i, item, state, sections)

Expand All @@ -403,35 +426,57 @@ def get_labels(article_id, user_id):
stream_data.append(obj)
else:
stream_data_body.append(obj)
elif state["back"]:
if state["label"] == "<sec>":
elif state['back']:
if state['label'] == '<sec>':
stream_data_back.append(obj)
if state["label"] == "<p>":
if state['label'] == '<p>':
num_ref = num_ref + 1
# obj = {}#process_reference(num_ref, obj, user_id)
obj_reference.append(
{
"num_ref": num_ref,
"obj": obj,
"text": obj["value"]["paragraph"],
}
)
# stream_data_back.append(obj)
#obj = {}#process_reference(num_ref, obj, user_id)
obj_reference.append({"num_ref": num_ref, "obj": obj, "text": obj['value']['paragraph'],})
#stream_data_back.append(obj)
else:
stream_data.append(obj)

num_refs = [item["num_ref"] for item in obj_reference]

if get_llm_model_name() == "LLAMA":
for obj_ref in obj_reference:
obj = process_reference(obj_ref["num_ref"], obj_ref["obj"], user_id)
stream_data_back.append(obj)
obj = process_reference(obj_ref['num_ref'], obj_ref['obj'], user_id)

is_reference = obj.get('is_reference', True)

# Por si el modelo devuelve "false" como string
if isinstance(is_reference, str):
is_reference = is_reference.lower() == 'true'

if is_reference:
# Opcional: quitar is_reference si no lo necesitas en el StreamField
obj.pop('is_reference', None)
stream_data_back.append(obj)

else:
full_text = (
obj.get('full_text')
or obj_ref.get('text')
or obj_ref.get('obj', {}).get('text')
or ''
)

obj_no_reference = {
'type': 'paragraph',
'value': {
'label': '<p>',
'paragraph': full_text
}
}

obj_postreference.append(obj_no_reference)

else:
if llm_first_block is None:
llm_first_block = LlamaService(mode="prompt", temperature=0.1)
chunks = split_in_three(obj_reference)
output = []

output_reference = []
num_refs_reference = []
logger.info(
"get_labels: processando %d referências com Gemini (%d chunks)",
len(obj_reference),
Expand All @@ -440,21 +485,54 @@ def get_labels(article_id, user_id):

for chunk in chunks:
if len(chunk) > 0:
text_references = (
"\n".join([item["text"] for item in chunk])
.replace("<italic>", "")
.replace("</italic>", "")
)
text_references = "\n".join(
[item["text"] for item in chunk]
).replace('<italic>', '').replace('</italic>', '')

prompt_reference = create_prompt_reference(text_references)

result = llm_first_block.run(prompt_reference)

match = re.search(r"\[.*\]", result, re.DOTALL)
match = re.search(r'\[.*\]', result, re.DOTALL)

if match:
parsed = json.loads(match.group(0))
output.extend(parsed) # Agrega a la lista de salida

stream_data_back.extend(process_references(num_refs, output))
for index, item_response in enumerate(parsed):
if index >= len(chunk):
continue

original_item = chunk[index]

is_reference = item_response.get('is_reference', True)

# Por si el modelo regresa "false" como texto
if isinstance(is_reference, str):
is_reference = is_reference.lower() == 'true'

if is_reference:
num_refs_reference.append(original_item["num_ref"])
output_reference.append(item_response)

else:
full_text = (
item_response.get('full_text')
or original_item.get('text')
or ''
)

obj_no_reference = {
'type': 'paragraph',
'value': {
'label': '<p>',
'paragraph': full_text
}
}

obj_postreference.append(obj_no_reference)

stream_data_back.extend(process_references(num_refs_reference, output_reference))
stream_data_back.extend(obj_postreference)

# data_front is never iterated inside get_xml — rescue any <p> items that the
# state machine left in stream_data (body paragraphs misclassified as front
Expand Down
Loading
Loading