{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting boilerpy3\n", " Downloading boilerpy3-1.0.7-py3-none-any.whl.metadata (5.8 kB)\n", "Downloading boilerpy3-1.0.7-py3-none-any.whl (22 kB)\n", "Installing collected packages: boilerpy3\n", "Successfully installed boilerpy3-1.0.7\n", "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n" ] } ], "source": [ "# %pip installs into the running kernel's environment; version pinned to the one recorded in the output.\n", "%pip install boilerpy3==1.0.7" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "raw_html = \"\"\"\n", "Boston Public Schools
\"Margarita
Margarita Muniz Academy

20 Child St Jamaica Plain, MA 02130

School / Provider Type:Boston Public School

Ages / Grades Served:7, 8, 9, 10, 11, 12

School Quality Tier:4

This school has eligibility requirements. Check eligibility with a student to see if they qualify.

Preview Session Date 1:11/20/2024, 9:00 AM - 10:30 AM In-Person

Preview Session Date 2:12/17/2024, 9:30 AM - 11:00 AM In-Person

Preview Session Date 3:1/22/2025, 1:00 AM - 2:30 AM In-Person

Move left
Move right
Move up
Move down
+Zoom in
-Zoom out
HomeJump left by 75%
EndJump right by 75%
Page UpJump up by 75%
Page DownJump down by 75%
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
To navigate, press the arrow keys.
\"\"
82
\"\"
41
\"\"
74
\"\"
103
\"\"
233
\"\"
65
\"\"
141
\"\"
37
\"\"
15
\"\"
17
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"

Margarita Muniz Academy

7-12

  • \"Google\"
    Map data ©2025 Google
    Map data ©2025 Google

    846 Schools

    Sort By

    This data represents schools and programs for the 2025-26 school year and is subject to change. Great Starts would love your feedback: https://forms.gle/1kDr6byBat5HGHZF7

    \n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "from boilerpy3 import extractors\n", "\n", "# Extract the main content from raw_html (defined in the previous cell).\n", "# The name `html` is not bound at this point on a fresh kernel, so it must not be used here.\n", "extractor = extractors.DefaultExtractor()\n", "content = extractor.get_content(raw_html)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "School / Provider Type:Boston Public School\n", "Ages / Grades Served:7, 8, 9, 10, 11, 12\n", "School Quality Tier:4\n", "This school has eligibility requirements. Check eligibility with a student to see if they qualify.\n", "About\n", "Program\n", "Phone:(617) 635-8198\n", "Uniform:No\n", "ADA:No\n", "Overview / Mission Statement:We are a Dual Language High School serving grade 7 to 12. College and career focused, we offer an Early College program starting in Grade 10; expeditionary learning inspired classrooms; music, visual arts, and graphic arts; sports and clubs for all grades; and a wealth of student and family support.\n", "Unique Features / Characteristics :The Margarita Muñiz Academy is dedicated to full cultural and linguistic fluency in Spanish and English for all its students. Through deep partnerships with families and community, the school prepares students for higher education, careers, and civic leadership.\n", "School Quality Framework (SQF): https://www.bostonpublicschools.org/Page/7849\n", "Dual Language:Spanish Dual-Language Program\n", "After School Program:Debate, Sports, Tutoring, and Clubs\n", "82\n", "41\n", "74\n", "103\n", "233\n", "65\n", "141\n", "37\n", "15\n", "17\n", "This data represents schools and programs for the 2025-26 school year and is subject to change. 
Great Starts would love your feedback: https://forms.gle/1kDr6byBat5HGHZF7\n", "\n" ] } ], "source": [ "print(content)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "from goose3 import Goose\n", "\n", "# Run Goose3 on the same raw_html string used by the other extractors.\n", "g = Goose()\n", "article = g.extract(raw_html=raw_html)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['https://www.bostonpublicschools.org/Page/8277', 'https://www.bostonpublicschools.org/Page/7849']\n" ] } ], "source": [ "# Show the links Goose3 found in the extracted article.\n", "print(article.links)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: readability-lxml in /Users/mtwesley/.pyenv/versions/africansentiment/lib/python3.11/site-packages (0.8.1)\n", "Requirement already satisfied: chardet in /Users/mtwesley/.pyenv/versions/africansentiment/lib/python3.11/site-packages (from readability-lxml) (5.2.0)\n", "Requirement already satisfied: lxml in /Users/mtwesley/.pyenv/versions/africansentiment/lib/python3.11/site-packages (from readability-lxml) (5.3.0)\n", "Requirement already satisfied: cssselect in /Users/mtwesley/.pyenv/versions/africansentiment/lib/python3.11/site-packages (from readability-lxml) (1.2.0)\n", "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n" ] } ], "source": [ "# %pip installs into the running kernel's environment; version pinned to the one recorded in the output.\n", "%pip install readability-lxml==0.8.1" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "from readability import Document\n", "import requests\n", "from lxml import html\n", "\n", "doc = Document(raw_html)" ] }, { 
"cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "clean_text = doc.summary(html_partial=False)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "
    \n" ] } ], "source": [ "print(clean_text)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: trafilatura in /Users/mtwesley/.pyenv/versions/africansentiment/lib/python3.11/site-packages (1.12.0)\n", "Requirement already satisfied: certifi in /Users/mtwesley/.pyenv/versions/africansentiment/lib/python3.11/site-packages (from trafilatura) (2023.11.17)\n", "Requirement already satisfied: courlan>=1.2.0 in /Users/mtwesley/.pyenv/versions/africansentiment/lib/python3.11/site-packages (from trafilatura) (1.3.0)\n", "Requirement already satisfied: htmldate>=1.8.1 in /Users/mtwesley/.pyenv/versions/africansentiment/lib/python3.11/site-packages (from trafilatura) (1.8.1)\n", "Requirement already satisfied: justext>=3.0.1 in /Users/mtwesley/.pyenv/versions/africansentiment/lib/python3.11/site-packages (from trafilatura) (3.0.1)\n", "Requirement already satisfied: lxml>=5.2.2 in /Users/mtwesley/.pyenv/versions/africansentiment/lib/python3.11/site-packages (from trafilatura) (5.3.0)\n", "Requirement already satisfied: charset-normalizer>=3.2.0 in /Users/mtwesley/.pyenv/versions/africansentiment/lib/python3.11/site-packages (from trafilatura) (3.3.2)\n", "Requirement already satisfied: urllib3<3,>=1.26 in /Users/mtwesley/.pyenv/versions/africansentiment/lib/python3.11/site-packages (from trafilatura) (2.1.0)\n", "Requirement already satisfied: babel>=2.15.0 in /Users/mtwesley/.pyenv/versions/africansentiment/lib/python3.11/site-packages (from courlan>=1.2.0->trafilatura) (2.16.0)\n", "Requirement already satisfied: tld>=0.13 in /Users/mtwesley/.pyenv/versions/africansentiment/lib/python3.11/site-packages (from courlan>=1.2.0->trafilatura) (0.13)\n", "Requirement already satisfied: dateparser>=1.1.2 in /Users/mtwesley/.pyenv/versions/africansentiment/lib/python3.11/site-packages (from htmldate>=1.8.1->trafilatura) (1.2.0)\n", "Requirement already satisfied: 
python-dateutil>=2.8.2 in /Users/mtwesley/.pyenv/versions/africansentiment/lib/python3.11/site-packages (from htmldate>=1.8.1->trafilatura) (2.8.2)\n", "Requirement already satisfied: pytz in /Users/mtwesley/.pyenv/versions/africansentiment/lib/python3.11/site-packages (from dateparser>=1.1.2->htmldate>=1.8.1->trafilatura) (2024.1)\n", "Requirement already satisfied: regex!=2019.02.19,!=2021.8.27 in /Users/mtwesley/.pyenv/versions/africansentiment/lib/python3.11/site-packages (from dateparser>=1.1.2->htmldate>=1.8.1->trafilatura) (2023.12.25)\n", "Requirement already satisfied: tzlocal in /Users/mtwesley/.pyenv/versions/africansentiment/lib/python3.11/site-packages (from dateparser>=1.1.2->htmldate>=1.8.1->trafilatura) (5.2)\n", "Requirement already satisfied: lxml-html-clean in /Users/mtwesley/.pyenv/versions/africansentiment/lib/python3.11/site-packages (from lxml[html_clean]>=4.4.2->justext>=3.0.1->trafilatura) (0.2.0)\n", "Requirement already satisfied: six>=1.5 in /Users/mtwesley/.pyenv/versions/africansentiment/lib/python3.11/site-packages (from python-dateutil>=2.8.2->htmldate>=1.8.1->trafilatura) (1.16.0)\n", "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n" ] } ], "source": [ "# %pip installs into the running kernel's environment; version pinned to the one recorded in the output.\n", "%pip install trafilatura==1.12.0" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import trafilatura" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "clean_text = trafilatura.extract(raw_html)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "20 Child St Jamaica 
Plain, MA 02130\n", "School / Provider Type:Boston Public School\n", "Ages / Grades Served:7, 8, 9, 10, 11, 12\n", "School Quality Tier:4\n", "This school has eligibility requirements. Check eligibility with a student to see if they qualify.\n", "Hours of Operation:8:30am - 3:30pm; 12:30pm Fri\n", "Phone:(617) 635-8198\n", "Email:munizacademy@bostonpublicschools.org\n", "School Leader:Dania Vazquez\n", "Transportation:https://www.bostonpublicschools.org/Page/8277\n", "Uniform:No\n", "ADA:No\n", "Overview / Mission Statement:We are a Dual Language High School serving grade 7 to 12. College and career focused, we offer an Early College program starting in Grade 10; expeditionary learning inspired classrooms; music, visual arts, and graphic arts; sports and clubs for all grades; and a wealth of student and family support.\n", "Unique Features / Characteristics :The Margarita Muñiz Academy is dedicated to full cultural and linguistic fluency in Spanish and English for all its students. Through deep partnerships with families and community, the school prepares students for higher education, careers, and civic leadership.\n", "School Quality Framework (SQF):https://www.bostonpublicschools.org/Page/7849\n", "Dual Language:Spanish Dual-Language Program\n", "Language Programming:Dual Language - Spanish, SLIFE Spanish\n", "School Profile:https://www.bostonpublicschools.org/Page/877\n", "Preview Session Date 1:11/20/2024, 9:00 AM - 10:30 AM In-Person\n", "Preview Session Date 2:12/17/2024, 9:30 AM - 11:00 AM In-Person\n", "Preview Session Date 3:1/22/2025, 1:00 AM - 2:30 AM In-Person\n", "After School Program:Debate, Sports, Tutoring, and Clubs\n", "Partners Link:https://www.partnerbps.org/\n", "846 Schools\n", "Sort By\n", "This data represents schools and programs for the 2025-26 school year and is subject to change. 
Great Starts would love your feedback: https://forms.gle/1kDr6byBat5HGHZF7\n" ] } ], "source": [ "print(clean_text)" ] } ], "metadata": { "kernelspec": { "display_name": "africansentiment", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.2" } }, "nbformat": 4, "nbformat_minor": 2 }