friendshipkim committed
Commit 9c00199
1 Parent(s): e899844

cache_data to cache
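
The change is a one-line swap repeated across nine helpers: the modern st.cache_data decorator (introduced in Streamlit 1.18) is replaced with the legacy st.cache, presumably so the app runs on an older Streamlit release that predates st.cache_data. A minimal sketch of the resulting pattern, condensed from one of the app's own loaders (the bare decorator matches the diff; the condensed body is illustrative):

import streamlit as st
from datasets import load_dataset

ORG_ID = "cornell-authorship"

# Legacy caching decorator, as used after this commit. The first call runs the
# function and stores the result; later calls with the same arguments return
# the cached value instead of re-downloading.
@st.cache
def get_candidate_strings():
    return load_dataset(f"{ORG_ID}/IUR_Reddit_test_candidates_english")["train"]

One caveat with the legacy decorator: st.cache hashes the return value to detect mutation, so functions returning unhashable objects may need allow_output_mutation=True or a custom hash_funcs mapping.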

Files changed (1):
  app.py  +9 -9
app.py CHANGED
@@ -11,7 +11,7 @@ from annotated_text import annotated_text
 
 ORG_ID = "cornell-authorship"
 
-@st.cache_data
+@st.cache
 def preprocess_text(s):
     return list(filter(lambda x: x!= '', (''.join(c if c.isalnum() or c == ' ' else ' ' for c in s)).split(' ')))
 
@@ -21,7 +21,7 @@ def get_pairwise_distances(model):
     df = pd.DataFrame(dataset).set_index('index')
     return df
 
-@st.cache_data
+@st.cache
 def get_pairwise_distances_chunked(model, chunk):
     # for df in pd.read_csv(f"{ASSETS_PATH}/{model}/pairwise_distances.csv", chunksize = 16):
     # print(df.iloc[0]['queries'])
@@ -29,7 +29,7 @@ def get_pairwise_distances_chunked(model, chunk):
     # return df
     return get_pairwise_distances(model)
 
-@st.cache_data
+@st.cache
 def get_query_strings():
     # df = pd.read_json(hf_hub_download(repo_id=repo_id, filename="IUR_Reddit_test_queries_english.jsonl"), lines = True)
     dataset = load_dataset(f"{ORG_ID}/IUR_Reddit_test_queries_english")["train"]
@@ -41,7 +41,7 @@ def get_query_strings():
 
     # return pd.read_parquet(f"{ASSETS_PATH}/IUR_Reddit_test_queries_english.parquet", columns=['fullText', 'index', 'authorIDs'])
 
-@st.cache_data
+@st.cache
 def get_candidate_strings():
     # df = pd.read_json(f"{ASSETS_PATH}/IUR_Reddit_test_candidates_english.jsonl", lines = True)
     dataset = load_dataset(f"{ORG_ID}/IUR_Reddit_test_candidates_english")["train"]
@@ -52,28 +52,28 @@ def get_candidate_strings():
     # df.to_parquet(f"{ASSETS_PATH}/IUR_Reddit_test_candidates_english.parquet", index = 'index', partition_cols = 'partition')
     # return pd.read_parquet(f"{ASSETS_PATH}/IUR_Reddit_test_candidates_english.parquet", columns=['fullText', 'index', 'authorIDs'])
 
-@st.cache_data
+@st.cache
 def get_embedding_dataset(model):
     # data = load_from_disk(f"{ASSETS_PATH}/{model}/embedding")
     data = load_dataset(f"{ORG_ID}/{model}_embedding")
     return data
 
-@st.cache_data
+@st.cache
 def get_bad_queries(model):
     df = get_query_strings().iloc[list(get_pairwise_distances(model)['queries'].unique())][['fullText', 'index', 'authorIDs']]
     return df
 
-@st.cache_data
+@st.cache
 def get_gt_candidates(model, author):
     gt_candidates = get_candidate_strings()
     df = gt_candidates[gt_candidates['authorIDs'].apply(lambda x: x[0]) == author]
     return df
 
-@st.cache_data
+@st.cache
 def get_candidate_text(l):
     return get_candidate_strings().at[l,'fullText']
 
-@st.cache_data
+@st.cache
 def get_annotated_text(text, word, pos):
     # print("here", word, pos)
     start = text.index(word, pos)
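
A behavioral difference worth keeping in mind when moving from st.cache_data back to st.cache: st.cache_data returns a fresh copy of the cached value on each call, while legacy st.cache returns the shared cached object and hashes it to detect in-place changes, so mutating a cached DataFrame can trigger a CachedObjectMutationWarning. A hedged sketch of the usual legacy workaround (load_table is a stand-in, not a function from app.py):

import streamlit as st
import pandas as pd

# allow_output_mutation=True disables the legacy mutation check, at the cost of
# letting callers mutate the one shared cached object in place.
@st.cache(allow_output_mutation=True)
def load_table():
    return pd.DataFrame({"queries": [0, 1, 2]})

df = load_table()
df["queries"] += 1  # no warning, but this edits the cached object for everyone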