---
base_model:
- microsoft/markuplm-base
library_name: transformers
pipeline_tag: token-classification
tags:
- html
- parser
- web-crawler
- news
- crawler
---

This is a fine-tuned proof-of-concept (PoC) of the [markuplm-base](https://huggingface.co/microsoft/markuplm-base) model for extracting news attributes from web pages: author, publication date, title, content, etc.

**Inference example**

The code accepts a URL as input, loads the web page, and returns JSON with the extracted data (author, publication date, title, and content). The `utils` module and `rank_titles` are project-local helpers from the repository and are not shown here.

```python
import os

import torch
from transformers import MarkupLMForTokenClassification, MarkupLMProcessor

import utils  # project-local helpers: get_html_content, clean_html, extract_nodes_and_feautures, split_sliding_data

id2label = {
    0: "none", 1: "title", 2: "content", 3: "author", 4: "date",
    5: "header", 6: "footer", 7: "rail", 8: "advertisement", 9: "navigation",
}
label2id = {label: idx for idx, label in id2label.items()}


def eval(url):
    current_dir = os.path.dirname(os.path.abspath(__file__))
    model_folder = os.path.join(current_dir, "models")  # "models" folder is in the repository root
    model_name = "OxMarkupLM.pt"
    model_path = os.path.join(model_folder, model_name)

    # Nodes and xpaths are extracted beforehand, so the processor
    # must not parse raw HTML itself.
    processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base")
    processor.parse_html = False

    model = MarkupLMForTokenClassification.from_pretrained(
        model_path, id2label=id2label, label2id=label2id
    )

    html = utils.clean_html(utils.get_html_content(url))
    data = [utils.extract_nodes_and_feautures(html)]
    example = utils.split_sliding_data(data, 10, 0)

    title, author, date, content = [], [], [], []
    for chunk in example:
        nodes, xpaths = chunk["nodes"], chunk["xpaths"]
        encoding = processor(
            nodes=nodes,
            xpaths=xpaths,
            return_offsets_mapping=True,
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        offset_mapping = encoding.pop("offset_mapping")

        with torch.no_grad():
            logits = model(**encoding).logits
        predictions = logits.argmax(-1)

        processed_words = []
        for pred_id, word_id, offset in zip(
            predictions[0].tolist(), encoding.word_ids(0), offset_mapping[0].tolist()
        ):
            # Keep only the first sub-token of each node (offset[0] == 0).
            if word_id is not None and offset[0] == 0:
                if pred_id == 1:
                    title.append(nodes[word_id])
                elif pred_id == 2 and word_id not in processed_words:
                    processed_words.append(word_id)
                    content.append(nodes[word_id])
                elif pred_id == 3:
                    author.append(nodes[word_id])
                elif pred_id == 4:
                    date.append(nodes[word_id])

    title = rank_titles(title, "\n".join(content))  # project-local helper (not shown here)
    return {
        "model_name": model_name,
        "url": url,
        "title": title,
        "author": author,
        "date": date,
        "content": content,
    }
```

**More details**

More details on how this model was fine-tuned can be found in [Building LLM crawler](https://medium.com/@olga.rondareva/building-llm-crawler-cb73f46c73ad).
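
**Usage**

A minimal sketch of calling the function above, assuming the repository's helper modules are importable and the checkpoint is present in `models/`; the URL is illustrative:

```python
import json

# Hypothetical news URL, for illustration only.
result = eval("https://example.com/news/some-article")

# The function returns a plain dict with title/author/date/content lists.
print(json.dumps(result, indent=2, ensure_ascii=False))
```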
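
**Note on node extraction**

The repository sets `processor.parse_html = False` and supplies its own parallel lists of text nodes and XPaths via `utils.extract_nodes_and_feautures` (left `True`, `MarkupLMProcessor` would extract them from raw HTML itself). That helper is not shown in this card; a hypothetical sketch of such extraction with `lxml`, not the repository's actual implementation, could look like this:

```python
from lxml import html as lxml_html

def extract_nodes_and_xpaths(html_string):
    # Parse the HTML and walk every element, collecting visible text
    # together with the absolute XPath of its element.
    tree = lxml_html.fromstring(html_string)
    root = tree.getroottree()
    nodes, xpaths = [], []
    for element in tree.iter():
        if not isinstance(element.tag, str):
            continue  # skip comments and processing instructions
        text = (element.text or "").strip()
        if text:
            nodes.append(text)
            xpaths.append(root.getpath(element))
    return nodes, xpaths
```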