Alexander Casimir Fischer
committed
Commit 3e72a37 · Parent: 11d5492
new file: __pycache__/common.cpython-311.pyc
new file: __pycache__/main.cpython-311.pyc
new file: common.py
new file: main.py
new file: qc_run.py
new file: requirements.txt
- __pycache__/common.cpython-311.pyc +0 -0
- __pycache__/main.cpython-311.pyc +0 -0
- common.py +301 -0
- main.py +174 -0
- qc_run.py +118 -0
- requirements.txt +0 -0
__pycache__/common.cpython-311.pyc
ADDED
Binary file (16 kB)
__pycache__/main.cpython-311.pyc
ADDED
Binary file (17.3 kB)
common.py
ADDED
@@ -0,0 +1,301 @@
#Central storage for variables, objects and templates used by both apps
import os

#Importing dependencies
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.tools import WikipediaQueryRun
from langchain.utilities import WikipediaAPIWrapper

#Prompt variables dictionary: definitions of the 4th grade Common Core Standards according to the source provided by Crossover
standard_definition_dict = {
    "CCSS.ELA-LITERACY.W.4.1": "Write opinion pieces on topics or texts, supporting a point of view with reasons and information",
    "CCSS.ELA-LITERACY.W.4.2": "Write informative/explanatory texts to examine a topic and convey ideas and information clearly",
    "CCSS.ELA-LITERACY.W.4.3": "Write narratives to develop real or imagined experiences or events using effective technique, descriptive details, and clear event sequences",
    "CCSS.ELA-LITERACY.W.4.4": "Produce clear and coherent writing in which the development and organization are appropriate to task, purpose, and audience",
    "CCSS.ELA-LITERACY.W.4.5": "Develop and strengthen writing as needed by planning, revising, and editing - with guidance and support from peers and adults",
    "CCSS.ELA-LITERACY.W.4.6": "Use technology, including the Internet, to produce and publish writing as well as to interact and collaborate with others; demonstrate sufficient command of keyboarding skills to type a minimum of one page in a single sitting - with some guidance and support from adults",
    "CCSS.ELA-LITERACY.W.4.7": "Conduct short research projects that build knowledge through investigation of different aspects of a topic",
    "CCSS.ELA-LITERACY.W.4.8": "Recall relevant information from experiences or gather relevant information from print and digital sources; take notes and categorize information, and provide a list of sources",
    "CCSS.ELA-LITERACY.W.4.9": "Draw evidence from literary or informational texts to support analysis, reflection, and research",
    "CCSS.ELA-LITERACY.W.4.10": "Write routinely over extended time frames (time for research, reflection, and revision) and shorter time frames (a single sitting or a day or two) for a range of discipline-specific tasks, purposes, and audiences"
}

#Prompt variables dictionary: defines how the AI's FRQ output must be formulated depending on the CCS selected,
#since an FRQ is not always a question in the grammatical sense
question_or_task_dict = {
    "CCSS.ELA-LITERACY.W.4.1": "writing task",
    "CCSS.ELA-LITERACY.W.4.2": "writing task",
    "CCSS.ELA-LITERACY.W.4.3": "creative writing task",
    "CCSS.ELA-LITERACY.W.4.4": "writing task. Also define the purpose and the supposed audience of the student's writing",
    "CCSS.ELA-LITERACY.W.4.5": "writing task",
    "CCSS.ELA-LITERACY.W.4.6": "research and writing task. The student may use the internet for research and ask adults for some guidance",
    "CCSS.ELA-LITERACY.W.4.7": "research and writing task with the goal of expanding the student's knowledge",
    "CCSS.ELA-LITERACY.W.4.8": "scientific writing task",
    "CCSS.ELA-LITERACY.W.4.9": "free response question",
    "CCSS.ELA-LITERACY.W.4.10": "homework writing task"
}

#Rubric dictionary: selects the rubric to be displayed in an expander after the evaluation
rubric_dict = {
    "CCSS.ELA-LITERACY.W.4.1": """
• Write opinion pieces on topics or texts,
supporting a point of view with reasons and information.
• Introduce a topic or text clearly, state an opinion, and
create an organizational structure in which related ideas
are grouped to support the writer's purpose.
• Provide reasons that are supported by facts and details.
• Link opinion and reasons using words and phrases (e.g.,
for instance, in order to, in addition).
• Provide a concluding statement or section related to the
opinion presented.
""",
    "CCSS.ELA-LITERACY.W.4.2": """
• Write informative/explanatory texts to examine a topic
and convey ideas and information clearly.
• Introduce a topic clearly and group related information
in paragraphs and sections; include formatting (e.g.,
headings), illustrations, and multimedia when useful to
aiding comprehension.
• Develop the topic with facts, definitions, concrete
details, quotations, or other information and examples
related to the topic.
• Link ideas within categories of information using words
and phrases (e.g., another, for example, also, because).
• Use precise language and domain-specific vocabulary to
inform about or explain the topic.
• Provide a concluding statement or section related to the
information or explanation presented.
""",
    "CCSS.ELA-LITERACY.W.4.3": """
• Write narratives to develop real or imagined experiences
or events using effective technique, descriptive details,
and clear event sequences.
• Orient the reader by establishing a situation and
introducing a narrator and/or characters; organize an
event sequence that unfolds naturally.
• Use dialogue and description to develop experiences and
events or show the responses of characters to situations.
• Use a variety of transitional words and phrases to manage
the sequence of events.
• Use concrete words and phrases and sensory details to
convey experiences and events precisely.
• Provide a conclusion that follows from the narrated
experiences or events.
""",
    "CCSS.ELA-LITERACY.W.4.4": """
Produce clear and coherent writing in which the development
and organization are appropriate to task, purpose, and
audience. (Grade-specific expectations for writing types
are defined in standards 1-3 above.)
""",
    "CCSS.ELA-LITERACY.W.4.5": """
With guidance and support from peers and adults, develop
and strengthen writing as needed by planning, revising,
and editing. (Editing for conventions should demonstrate
command of Language standards 1-3 up to and including
grade 4 here.)
""",
    "CCSS.ELA-LITERACY.W.4.6": """
With some guidance and support from adults, use technology,
including the Internet, to produce and publish writing as
well as to interact and collaborate with others;
demonstrate sufficient command of keyboarding skills to
type a minimum of one page in a single sitting.
""",
    "CCSS.ELA-LITERACY.W.4.7": """
Conduct short research projects that build knowledge
through investigation of different aspects of a topic.
""",
    "CCSS.ELA-LITERACY.W.4.8": """
Recall relevant information from experiences or gather
relevant information from print and digital sources; take
notes and categorize information, and provide a list of
sources.
""",
    "CCSS.ELA-LITERACY.W.4.9": """
• Draw evidence from literary or informational texts to
support analysis, reflection, and research.
• Apply grade 4 Reading standards to literature (e.g.,
"Describe in depth a character, setting, or event in a
story or drama, drawing on specific details in the text
[e.g., a character's thoughts, words, or actions].").
• Apply grade 4 Reading standards to informational texts
(e.g., "Explain how an author uses reasons and evidence
to support particular points in a text").
""",
    "CCSS.ELA-LITERACY.W.4.10": """
Write routinely over extended time frames (time for
research, reflection, and revision) and shorter time
frames (a single sitting or a day or two) for a range of
discipline-specific tasks, purposes, and audiences.
"""
}

#Prompt templates: where the actual prompt engineering happens. See the Google Doc for further reference
prompt_context = PromptTemplate(
    input_variables=["chosen_topic", "wikitext"],
    template="You are a writer at a school book publishing company. \
You will be given a certain topic; your task is to write a school book article about it. \
Choose your words in accordance with the Wikipedia material, specifically the explanations you find at the end of this prompt. \
Please include only full sentences in your answer. \
The article should have a clearly defined thread that is easy to follow. \
Please do not give the article a title; only include the body in your answer. \
Please write in a style that is fun to read and understandable by 12 year old kids. \
The article should be around 10-15 sentences long. \n\
\n\
Topic: {chosen_topic} \n\
\n\
Wikipedia: \n\
{wikitext}"
)
prompt_frq = PromptTemplate(
    input_variables=["context", "standard_definition", "question_or_task"],
    template="You are a 4th grade school teacher. \
You will be given an input text, which is a short article on a certain topic. \
Drawing on the content of the input text and addressing a 4th grade student, you will then generate a {question_or_task}. \
Do not include a title such as 'Question:' or 'Writing task:' in your output. \
Your output must be formulated in such a way that it can be used to test a person's ability to {standard_definition}. \n\
\n\
Input text: \n\
\n\
{context}"
)
prompt_evaluation = PromptTemplate(
    input_variables=["context", "rubric",
                     "frq", "chosen_answer"],
    template="You are a 4th grade school teacher. \
You will be given four inputs: An article on a certain topic. \
A task or question related to said article, meant to be solved by a 4th grade student. \
The answer to said task or question, written by one of your students. \
A standard for evaluating said answer. \
Your output should be an objective evaluation of the student's answer, taking all inputs into account. \
Here are your inputs: \n\
\n\
The article: \n\
\n\
{context} \n\
\n\
The task or question: \n\
\n\
{frq} \n\
\n\
The student's answer: \n\
\n\
{chosen_answer} \n\
\n\
The standard:\
\n\
The student should be able to {rubric}. \n\
\n\
Now please take some time to formulate your output, by thoroughly looking at each input. Also think about whether \
the student has showcased the required ability or not. In other words, whether they passed or failed the test, and why. \
Structure your output like this: \n\
\n\
'PASSED' or 'FAILED'\n\
Evaluation / Explanation"
)
prompt_topic_rand = PromptTemplate(input_variables=["var"], template="{var} give me a random \
writing topic in one or at most two words. Please nothing about sex, drugs, alcohol or violence.")
prompt_answer_good = PromptTemplate(input_variables=["context", "frq", "standard"],
    template="You are a Senior Test Manager at a successful software company and a very smart person. \
Your job is to test a new educational software. \
Please read the text presented to you below, then answer the task or question that follows. \
Please do not simply copy whole sentences from the text. Just behave like you were back in junior high, \
where you always had the best English grades of the entire school. \
Please formulate your answer in a way that shows off your ability to {standard}. \
The purpose of the software is to measure this ability in a student. \n\n\
Here is the text: \n\
{context} \n\n\
Here is the task or question: \n\
{frq} \n\n\
Please take some time to think, then give it your best shot.")
prompt_answer_bad = PromptTemplate(input_variables=["context", "frq"],
    template="You are a 10 year old student taking an English exam. You are not particularly intelligent. \
Unfortunately you also have trouble focusing on tasks, and you happen to get the worst English grades of the entire class. \
Quiet reading and homework just do not feel natural for you. You do not even mind, since you are good at a lot of other things. \
You are, for example, an incredibly creative person and like to tell stories. \
Please read the text presented to you below, then answer the task or question that follows. \
\n\n\
Here is the text: \n\
{context} \n\n\
Here is the task or question: \n\
{frq} \n\n\
Please remember: you will NOT perform well on this task. Create a poorly formulated answer, \
using a bit of bad grammar, and also make some logical mistakes, \
clearly indicating that you do not possess the skills being tested. \
You must try hard but still fail at this exam.")
prompt_qc_run = PromptTemplate(input_variables=["context", "frq", "rubric", \
    "answer_good", "evaluation_good", "answer_bad", "evaluation_bad"],
    template="You are a Senior Test Manager with 15 years of experience at a successful software company. \
Your daily business is to test educational AI software. You also have a degree in linguistics and love logic puzzles. \
Please have a look at 7 pieces of text, which will be given to you at the end of this prompt. \
Here are the 7 descriptions: \
1. an article on a certain topic, given by the software \n\
2. a free-response question on this article, given by the software \n\
3. a certain educational standard rubric, that is used to evaluate the answer to this free-response question \n\
4. the answer to the free-response question, given by a strong 4th grade student \n\
5. the evaluation of the strong answer, given by the software \n\
6. the answer to the free-response question, given by a weak 4th grade student \n\
7. the evaluation of the weak answer, given by the software \n\
Your task today is the following: please have a critical look at the output of the software. \
Take your time on each of the 7 texts, then give critical feedback on any shortcomings of the software's AI. \
Give recommendations on how to further improve the quality of texts number 1., 2., 5. and 7., \
by fine-tuning the AI instructions or prompts. \
Please be rather critical.\n\n\
{context}\n\n\
{frq}\n\n\
{rubric}\n\n\
{answer_good}\n\n\
{evaluation_good}\n\n\
{answer_bad}\n\n\
{evaluation_bad}")
prompt_qc_grade = PromptTemplate(input_variables=["qc_report"],
    template="You will be given a detailed report that was written to evaluate a new software's performance. \
Take a good look at the report and decide on an overall evaluation grade that aligns with the entire report's sentiment. \
The grade should be a percentage (an integer between 1 and 100 with a percent symbol) and correctly represent and support the report. \
Only give an integer and '%' as output, nothing else.\n\n\
Here is the report:\n\n\
{qc_report}")

#Defining LLMs, sources and chains
llm = ChatOpenAI(model="gpt-4", temperature=0.5)
precise = ChatOpenAI(model="gpt-4", temperature=0.0)
random = ChatOpenAI(model="gpt-4", temperature=0.9)
wikipedia = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
context_chain = LLMChain(llm=llm, prompt=prompt_context)
frq_chain = LLMChain(llm=llm, prompt=prompt_frq)
evaluation_chain = LLMChain(llm=llm, prompt=prompt_evaluation)
qc_answer_good_chain = LLMChain(llm=precise, prompt=prompt_answer_good)
qc_answer_bad_chain = LLMChain(llm=random, prompt=prompt_answer_bad)
topic_rand = LLMChain(llm=random, prompt=prompt_topic_rand)
var = "Please"  #filler input for topic_rand, whose template expects exactly one variable
qc_run_chain = LLMChain(llm=precise, prompt=prompt_qc_run)
qc_grade_chain = LLMChain(llm=precise, prompt=prompt_qc_grade)

#Tiny helper function that keeps the Wikipedia page metadata and/or
#unrelated follow-up pages from spilling into the LLM's output
def trim_text(input_string):
    keyword1 = "Summary:"
    keyword2 = "Page:"
    index1 = input_string.find(keyword1)
    if index1 == -1:
        new_string = input_string
    else:
        new_string = input_string[index1 + len(keyword1):].strip()
    index2 = new_string.find(keyword2)
    if index2 == -1:
        return new_string
    else:
        return new_string[:index2].strip()

#Function for detecting copy/pasted parts of the context used as the answer
def plagiate(context, answer):
    if answer in context:
        return True
    for i in range(len(answer) - 29):  #slide a 30-character window over the answer
        substring = answer[i:i + 30]
        if substring in context:
            return True
    return False
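Reviewer note on the two helpers above: a minimal smoke-test sketch (not part of the commit, hard-coded strings are made up for illustration). Bear in mind that importing common constructs the ChatOpenAI objects at module level, so OPENAI_API_KEY must be set in the environment even though no model call is made here.

#Hypothetical smoke test for the helpers in common.py
from common import trim_text, plagiate

raw = "Page: Honey bee\nSummary: Bees are flying insects. Page: Wasp\nSummary: Wasps sting."
print(trim_text(raw))  #-> "Bees are flying insects." (metadata and the next page are trimmed)

ctx = "Bees are flying insects closely related to wasps and ants."
print(plagiate(ctx, "flying insects closely related to wasps and ants."))  #True: copied from the context
print(plagiate(ctx, "I think bees matter a lot because they pollinate."))  #False: no copied 30-character run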
main.py
ADDED
@@ -0,0 +1,174 @@
#To access my personal API key from operating system environment variables.
#Inside the HuggingFace Space, this will be substituted by HF's "Secret" variable option.
#Feel free to use the tool as is (via my personal API key) for the time of my evaluation.
import os
import time

#Loading Streamlit for GUI
import streamlit as st

#Session variables - cached across reruns so the chains are not re-invoked every time a user input is entered
if "standard_set" not in st.session_state:
    st.session_state.standard_set = False
if "topic_set" not in st.session_state:
    st.session_state.topic_set = False
if "content_set" not in st.session_state:
    st.session_state.content_set = False
if "answer_set" not in st.session_state:
    st.session_state.answer_set = False
if "evaluated" not in st.session_state:
    st.session_state.evaluated = False
if "chosen_standard" not in st.session_state:
    st.session_state.chosen_standard = ""
if "standard_definition" not in st.session_state:
    st.session_state.standard_definition = ""
if "question_or_task" not in st.session_state:
    st.session_state.question_or_task = ""
if "chosen_topic" not in st.session_state:
    st.session_state.chosen_topic = ""
if "wikitext" not in st.session_state:
    st.session_state.wikitext = ""
if "context" not in st.session_state:
    st.session_state.context = ""
if "frq" not in st.session_state:
    st.session_state.frq = ""
if "chosen_answer" not in st.session_state:
    st.session_state.chosen_answer = ""
answer_logged = False

#Importing variables, objects, templates from "common"
from common import llm, wikipedia  #language model and Wikipedia source
from common import standard_definition_dict, question_or_task_dict, rubric_dict  #dictionaries
from common import prompt_context, prompt_frq, prompt_evaluation  #prompt templates
from common import context_chain, frq_chain, evaluation_chain  #prompting chains
from common import trim_text, plagiate  #custom functions

#Setting up streamlit UI, intro
st.set_page_config(page_title="FRQ Generator", page_icon="🎓",
                   menu_items={"About": "Version 1.0 \n\n Not for commercial use.",
                               "Get help": "https://www.linkedin.com/in/alex-c-fischer"})
st.title("🎓Common Core FRQ Generator")
with st.sidebar:
    st.title("Menu")
    st.link_button(label="Admin", url="https://www.google.com")
    st.link_button(label="Contact", url="https://www.linkedin.com/in/alex-c-fischer/")
st.write("This little tool automatically generates free-response questions (FRQs) \
to evaluate a 4th grade student's command of a given Common Core Writing Standard \
through reading and writing on a topic of their own choice. \
After the FRQ is answered, an evaluation is provided.")
st.write("(The language processing is done by an AI model, \
yet the facts are sourced from the topic's Wikipedia page to keep the content grounded and up to date.)")

#FRQ based on standard, student input and prompting engine
with st.form("standard_form"):
    st.session_state.chosen_standard = st.selectbox(
        "Choose 4th Grade Common Core Writing standard:",
        ("CCSS.ELA-LITERACY.W.4.1", "CCSS.ELA-LITERACY.W.4.2", "CCSS.ELA-LITERACY.W.4.3", "CCSS.ELA-LITERACY.W.4.4",
         "CCSS.ELA-LITERACY.W.4.5", "CCSS.ELA-LITERACY.W.4.6", "CCSS.ELA-LITERACY.W.4.7", "CCSS.ELA-LITERACY.W.4.8",
         "CCSS.ELA-LITERACY.W.4.9", "CCSS.ELA-LITERACY.W.4.10")
    )
    st.session_state.standard_definition = standard_definition_dict[st.session_state.chosen_standard]
    st.session_state.question_or_task = question_or_task_dict[st.session_state.chosen_standard]
    subm_standard = st.form_submit_button("Set")
    if subm_standard:
        st.session_state.standard_set = True
        st.write("We will test your ability to:")
        st.write(f"📜{st.session_state.standard_definition}.")
if st.session_state.standard_set:
    with st.form("topic_form"):
        st.session_state.chosen_topic = st.text_input("Type in a topic of your interest, then click 'Submit'.")
        subm_topic = st.form_submit_button("Submit")
if st.session_state.standard_set and subm_topic:
    st.empty()
    with st.spinner('🤖Browsing Wikipedia...'):
        if st.session_state.wikitext == "":
            wikitext = trim_text(wikipedia.run(st.session_state.chosen_topic))
            if wikitext == "No good Wikipedia Search Result was found":
                st.write(f"🤖Sorry - I can't find anything on Wikipedia about '{st.session_state.chosen_topic}'. \
I would love to make something up, but I can't do that in here. Please try something else.")
                got_it = st.button("Got it")
                st.session_state.topic_set = False
                st.stop()
            else:
                st.session_state.wikitext = wikitext
                st.session_state.topic_set = True
                st.success("Article found")
    with st.spinner('🤖So interesting! Now please give me a few seconds to create the context and FRQ.'):
        if st.session_state.context == "":
            st.session_state.context = context_chain.run(
                chosen_topic=st.session_state.chosen_topic,
                wikitext=st.session_state.wikitext
            )
        if st.session_state.frq == "":
            st.session_state.frq = frq_chain.run(
                context=st.session_state.context,
                standard_definition=st.session_state.standard_definition,
                question_or_task=st.session_state.question_or_task
            )
        st.success("Content and FRQ created")
if st.session_state.topic_set:
    with st.form("content_form"):
        st.write("🤖Here we go - that was quick, wasn't it?")
        st.subheader("Context required to answer the FRQ:")
        st.write(st.session_state.context)
        st.subheader("Free Response Question:")
        st.write(st.session_state.frq)
        st.write("🤖Read all of the above? Great! Continue with the assignment at your own pace.")
        next_step = st.form_submit_button("Continue")
        if next_step:
            st.session_state.content_set = True
if st.session_state.content_set:
    with st.form("answer_form"):
        st.session_state.chosen_answer = st.text_area("Type in your answer, then click 'Submit'. Please do not simply copy/paste from above.")
        subm_answer = st.form_submit_button("Submit")
if st.session_state.content_set and subm_answer:
    with st.spinner('🤖Logging...'):
        pass
    if plagiate(context=st.session_state.context, answer=st.session_state.chosen_answer):
        st.session_state.content_set = False
        st.write("🤖Using Ctrl+C/V defeats the purpose of this test, young friend.")
        time.sleep(0.1)
        st.write("🤖Those are the rules. Please overwrite the above answer in your own words - \
trust me, this is a great way to learn interesting new things.")
        got_it = st.button("Got it")
        st.stop()
    else:
        st.subheader("Answer submitted")
        answer_logged = True
if answer_logged:
    st.session_state.answer_set = True
if st.session_state.answer_set:
    with st.form("evaluation_form"):
        st.subheader("Evaluation")
        with st.spinner("🤖Let me see how you did today."):
            evaluation = evaluation_chain.run(
                context=st.session_state.context,
                rubric=rubric_dict[st.session_state.chosen_standard],
                frq=st.session_state.frq,
                chosen_answer=st.session_state.chosen_answer)
            st.write(evaluation)
        def clear_form():
            st.session_state.answer_set = False
        st.form_submit_button(label="Reformulate answer", on_click=clear_form)
        st.empty()
    with st.expander("Show Evaluation & Feedback Rubric"):
        st.write("AI evaluated the student's ability to:")
        st.text(rubric_dict[st.session_state.chosen_standard])
    rerun = st.button("Rerun")
    if rerun:
        for key in st.session_state.keys():
            del st.session_state[key]
        st.rerun()
st.divider()
st.write("Admin area: clicking below will open a new app")
st.link_button(label="QC Test run - let GPT-4 take this test", url="https://www.google.com")
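Reviewer note: main.py leans on one Streamlit pattern throughout - each stage writes a flag into st.session_state, so later top-to-bottom reruns of the script skip straight past completed stages. A minimal, self-contained sketch of that pattern (hypothetical names, not from the commit):

#Minimal sketch of the gating pattern used in main.py: a flag in
#st.session_state survives Streamlit's top-to-bottom script reruns.
import streamlit as st

if "stage_done" not in st.session_state:
    st.session_state.stage_done = False  #first run: stage still locked

with st.form("demo_form"):
    st.text_input("Say something")
    if st.form_submit_button("Submit"):
        st.session_state.stage_done = True  #unlock the next stage

if st.session_state.stage_done:
    st.write("Stage unlocked - this line renders on every rerun from now on.")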
qc_run.py
ADDED
@@ -0,0 +1,118 @@
import random as rand
import pandas as pd
import time
import datetime
import base64

import streamlit as st

#Importing variables, objects, templates from "common"
from common import wikipedia  #Wikipedia source
from common import standard_definition_dict, question_or_task_dict, rubric_dict  #dictionaries
from common import context_chain, frq_chain, evaluation_chain, topic_rand, var  #prompting chains
from common import qc_answer_good_chain, qc_answer_bad_chain, qc_run_chain, qc_grade_chain  #prompting chains
from common import trim_text  #custom function

#script:
st.set_page_config(page_title="QC Test Run FRQ Generator", page_icon="⚙️",
                   menu_items={"About": "Version 1.0 \n\n Not for commercial use.",
                               "Get help": "https://www.linkedin.com/in/alex-c-fischer"})
st.title("Automated QC Testing Script for Common Core FRQ Generator")
st.write("The original test will now be taken by GPT-4.")
mode = st.radio("Choose Mode", ["Single Launch (+live generating)", "Serial Launch (+CSV-Download)"])
launch_qc = st.button("Launch")
if mode == "Single Launch (+live generating)" and launch_qc:
    topic_qc = topic_rand.run(var=var)
    numb_qc = rand.randint(1, 10)  #pick one of the ten W.4 standards at random
    standard_qc = "CCSS.ELA-LITERACY.W.4." + str(numb_qc)
    st.divider()
    st.subheader("Random topic:")
    st.write(topic_qc)
    st.subheader("Random CC standard:")
    st.write(standard_qc)
    standard_definition_qc = standard_definition_dict[standard_qc]
    question_or_task_qc = question_or_task_dict[standard_qc]
    wikitext_qc = trim_text(wikipedia.run(topic_qc))
    st.divider()
    st.subheader("Context:")
    context_qc = context_chain.run(chosen_topic=topic_qc, wikitext=wikitext_qc)
    st.write(context_qc)
    st.divider()
    st.subheader("Free Response Question:")
    frq_qc = frq_chain.run(context=context_qc, standard_definition=standard_definition_qc, question_or_task=question_or_task_qc)
    st.write(frq_qc)
    st.divider()
    st.subheader("Good Answer, according to GPT-4:")
    answer_good_qc = qc_answer_good_chain.run(context=context_qc, frq=frq_qc, standard=standard_definition_qc)
    st.write(answer_good_qc)
    st.divider()
    st.subheader("Evaluation of 'Good Answer':")
    evaluation_good_qc = evaluation_chain.run(
        context=context_qc, rubric=rubric_dict[standard_qc],
        frq=frq_qc, chosen_answer=answer_good_qc
    )
    st.write(evaluation_good_qc)
    st.divider()
    st.subheader("Bad Answer, according to GPT-4:")
    answer_bad_qc = qc_answer_bad_chain.run(context=context_qc, frq=frq_qc)
    st.write(answer_bad_qc)
    st.divider()
    st.subheader("Evaluation of 'Bad Answer':")
    evaluation_bad_qc = evaluation_chain.run(
        context=context_qc, rubric=rubric_dict[standard_qc],
        frq=frq_qc, chosen_answer=answer_bad_qc
    )
    st.write(evaluation_bad_qc)
    st.divider()
    st.subheader("Quality Control Report:")
    qc_report = qc_run_chain.run(
        context=context_qc, frq=frq_qc, rubric=rubric_dict[standard_qc],
        answer_good=answer_good_qc, evaluation_good=evaluation_good_qc,
        answer_bad=answer_bad_qc, evaluation_bad=evaluation_bad_qc)
    st.write(qc_report)
    with st.form("Overall Accuracy"):
        st.header("Overall grading of generated content:")
        qc_grade = qc_grade_chain.run(qc_report=qc_report)
        st.header(qc_grade)
        st.write("Want to save this run?")
        st.write("Menu in upper right corner > Print > PDF")
        st.form_submit_button("Clear All & Rerun")
if mode == "Serial Launch (+CSV-Download)":
    batch = st.number_input("Number of reruns", min_value=1, max_value=20, value=1, step=1)
    comment = st.text_input("Comment - note your prompt fine-tunings here, to track and analyse their effects")
    if launch_qc:
        df = pd.DataFrame(columns=["Round", "Comment", "Standard", "Topic", "Context", "FRQ", "Good Answer", "Good Evaluation", "Bad Answer", "Bad Evaluation", "Quality Control Report", "Overall Accuracy"])
        progress = st.progress(0)
        for i in range(batch):
            progress.progress((i + 1) / batch)
            topic_qc = topic_rand.run(var=var)
            numb_qc = rand.randint(1, 10)
            standard_qc = "CCSS.ELA-LITERACY.W.4." + str(numb_qc)  #key must include the grade, "W.4.", to match the dictionaries
            standard_definition_qc = standard_definition_dict[standard_qc]
            question_or_task_qc = question_or_task_dict[standard_qc]
            wikitext_qc = trim_text(wikipedia.run(topic_qc))
            context_qc = context_chain.run(chosen_topic=topic_qc, wikitext=wikitext_qc)
            frq_qc = frq_chain.run(context=context_qc, standard_definition=standard_definition_qc, question_or_task=question_or_task_qc)
            answer_good_qc = qc_answer_good_chain.run(context=context_qc, frq=frq_qc, standard=standard_definition_qc)
            evaluation_good_qc = evaluation_chain.run(context=context_qc, rubric=rubric_dict[standard_qc], frq=frq_qc, chosen_answer=answer_good_qc)  #prompt_evaluation takes "rubric" as its input variable
            answer_bad_qc = qc_answer_bad_chain.run(context=context_qc, frq=frq_qc)
            evaluation_bad_qc = evaluation_chain.run(context=context_qc, rubric=rubric_dict[standard_qc], frq=frq_qc, chosen_answer=answer_bad_qc)
            qc_report = qc_run_chain.run(context=context_qc, frq=frq_qc, rubric=rubric_dict[standard_qc], answer_good=answer_good_qc, evaluation_good=evaluation_good_qc, answer_bad=answer_bad_qc, evaluation_bad=evaluation_bad_qc)  #prompt_qc_run likewise expects "rubric"
            qc_grade = qc_grade_chain.run(qc_report=qc_report)
            df.loc[len(df.index)] = {"Round": i + 1, "Comment": comment, "Standard": standard_qc,
                                     "Topic": topic_qc, "Context": context_qc, "FRQ": frq_qc,
                                     "Good Answer": answer_good_qc, "Good Evaluation": evaluation_good_qc,
                                     "Bad Answer": answer_bad_qc, "Bad Evaluation": evaluation_bad_qc,
                                     "Quality Control Report": qc_report, "Overall Accuracy": qc_grade}
            time.sleep(0.1)
        progress.empty()
        csv = df.to_csv(index=False)
        b64 = base64.b64encode(csv.encode()).decode()
        now = datetime.datetime.now()
        timestamp_str = now.strftime("%Y-%m-%d_%H-%M-%S")
        filename = f"{timestamp_str}_testruns_{batch}_rows.csv"
        href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">Download Results CSV</a>'
        st.markdown(href, unsafe_allow_html=True)
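Reviewer note on the CSV export: qc_run.py hand-rolls a base64 data-URI link and renders it with st.markdown(..., unsafe_allow_html=True). Recent Streamlit versions ship st.download_button, which achieves the same without raw HTML; a sketch of that alternative (assuming a Streamlit version that includes it, dummy data for illustration):

#Sketch: built-in alternative to the manual base64 <a href> download link
import pandas as pd
import streamlit as st

df = pd.DataFrame({"Round": [1], "Overall Accuracy": ["87%"]})  #dummy results
st.download_button(
    label="Download Results CSV",
    data=df.to_csv(index=False).encode(),
    file_name="testruns.csv",
    mime="text/csv",
)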
requirements.txt
ADDED
Binary file (11.7 kB)
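Reviewer note: requirements.txt being detected as a binary file is unusual for a plain-text pip file. One plausible cause (an assumption, not verifiable from this diff) is that it was produced by pip freeze on Windows, which writes UTF-16 with a byte-order mark; re-encoding it as UTF-8 would let diff viewers render it as text:

#Hypothetical fix, assuming the file is UTF-16 with a BOM (a common reason
#a .txt file gets flagged as binary); rewrites it as plain UTF-8.
with open("requirements.txt", "rb") as f:
    data = f.read()
if data.startswith((b"\xff\xfe", b"\xfe\xff")):
    text = data.decode("utf-16")
else:
    text = data.decode("utf-8")
with open("requirements.txt", "w", encoding="utf-8") as f:
    f.write(text)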