ReithBjarkan committed on
Commit
46c7082
·
1 Parent(s): 4c2bddb

Add primary keyword to CSV output with * marker

Browse files

- Add 'Primary Keyword' column to CSV output
- Include primary keyword as first row with similarity 1.000 and '*' marker
- Comparison keywords have empty string in Primary Keyword column
- Add comprehensive test suite to validate CSV structure
- Update documentation with test details

Files changed (3) hide show
  1. TEST_README.md +54 -0
  2. app.py +5 -1
  3. test_csv_output.py +146 -0
TEST_README.md ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CSV Output Validation Tests
2
+
3
+ This test suite validates that the CSV output from the Keyword Cosine Similarity Tool includes the primary keyword with proper formatting.
4
+
5
+ ## Running the Tests
6
+
7
+ ```bash
8
+ python test_csv_output.py
9
+ ```
10
+
11
+ ## What the Tests Validate
12
+
13
+ ### Test 1: Basic CSV Structure
14
+ - ✅ All required columns are present: `Keyword`, `Cosine Similarity`, `Primary Keyword`
15
+ - ✅ Primary keyword appears in the first row
16
+ - ✅ Primary keyword has similarity value of exactly `1.000`
17
+ - ✅ Primary keyword has `*` marker in the `Primary Keyword` column
18
+ - ✅ All comparison keywords have empty string `""` in the `Primary Keyword` column
19
+ - ✅ CSV format is valid and correctly formatted
20
+
21
+ ### Test 2: Primary Keyword in Comparison List (Edge Case)
22
+ - ✅ When the primary keyword appears in the comparison list, it's still marked with `*` in the first row
23
+ - ✅ The duplicate entry in the comparison results has an empty `Primary Keyword` column
24
+ - ✅ Exactly one row has the `*` marker (the primary keyword row)
25
+
26
+ ### Test 3: Single Comparison Keyword
27
+ - ✅ Handles edge case of only one comparison keyword correctly
28
+ - ✅ Primary keyword row is present with `*` marker
29
+ - ✅ Comparison keyword has empty `Primary Keyword` column
30
+
31
+ ### Test 4: Column Order
32
+ - ✅ Columns are in the correct order: `Keyword`, `Cosine Similarity`, `Primary Keyword`
33
+
34
+ ### Test 5: Similarity Sorting
35
+ - ✅ All comparison keywords are sorted by similarity in descending order
36
+ - ✅ Primary keyword (1.000) always appears first
37
+
38
+ ## Expected CSV Output Format
39
+
40
+ ```csv
41
+ Keyword,Cosine Similarity,Primary Keyword
42
+ corporate cards,1.000,*
43
+ business corporate cards,0.8677525,
44
+ card corporate,0.8351246,
45
+ corporate card,0.83510983,
46
+ ...
47
+ ```
48
+
49
+ ## Key Validation Points
50
+
51
+ 1. **Primary Keyword Row**: Always first row, similarity = 1.000, `Primary Keyword` = `*`
52
+ 2. **Comparison Rows**: Empty string `""` in `Primary Keyword` column
53
+ 3. **Parser-Friendly**: Scripts can easily find primary keyword by searching for `*` marker
54
+ 4. **Self-Contained**: CSV includes all necessary information without external context
app.py CHANGED
@@ -94,8 +94,12 @@ if st.button("Calculate Similarities"):
94
 
95
  # Sort results by cosine similarity
96
  st.info("Sorting results...")
97
- results = [{"Keyword": kw, "Cosine Similarity": sim} for kw, sim in zip(keyword_list, similarities)]
98
  sorted_results = sorted(results, key=lambda x: x["Cosine Similarity"], reverse=True)
 
 
 
 
99
 
100
  # Display results
101
  st.header("Results")
 
94
 
95
  # Sort results by cosine similarity
96
  st.info("Sorting results...")
97
+ results = [{"Keyword": kw, "Cosine Similarity": sim, "Primary Keyword": ""} for kw, sim in zip(keyword_list, similarities)]
98
  sorted_results = sorted(results, key=lambda x: x["Cosine Similarity"], reverse=True)
99
+
100
+ # Add primary keyword row at the top with "*" marker
101
+ primary_row = {"Keyword": primary_keyword, "Cosine Similarity": 1.000, "Primary Keyword": "*"}
102
+ sorted_results.insert(0, primary_row)
103
 
104
  # Display results
105
  st.header("Results")
test_csv_output.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sentence_transformers import SentenceTransformer
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+
6
+
7
def generate_results(primary_keyword, keyword_list, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Build the result rows that back the CSV output.

    Mirrors the results-generation logic in app.py so the CSV structure can
    be tested without running the Streamlit app.

    Args:
        primary_keyword: The keyword every other keyword is compared against.
        keyword_list: Sequence of comparison keywords. May be empty.
        model_name: Name of the SentenceTransformer model to load.

    Returns:
        A list of dicts with the keys "Keyword", "Cosine Similarity" and
        "Primary Keyword". The first entry is always the primary keyword
        (similarity 1.000, marker "*"); the remaining entries are the
        comparison keywords sorted by similarity in descending order, each
        with an empty "Primary Keyword" value.
    """
    # The primary row does not depend on the model, so build it up front.
    primary_row = {"Keyword": primary_keyword, "Cosine Similarity": 1.000, "Primary Keyword": "*"}

    # With nothing to compare against, the similarity call would receive an
    # empty embeddings array. Return just the primary row and skip the
    # (expensive) model load. len() is used instead of truthiness so that
    # array-like inputs are handled too.
    if len(keyword_list) == 0:
        return [primary_row]

    # Load model
    model = SentenceTransformer(model_name)

    # Generate embeddings (without convert_to_tensor to avoid device issues)
    primary_embedding = model.encode(primary_keyword)
    keyword_embeddings = model.encode(keyword_list)

    # One row of similarities: primary keyword vs. every comparison keyword
    similarities = cosine_similarity([primary_embedding], keyword_embeddings)[0]

    # Sort comparison rows by cosine similarity, highest first. sorted() is
    # stable, so ties keep their input order.
    results = [
        {"Keyword": kw, "Cosine Similarity": sim, "Primary Keyword": ""}
        for kw, sim in zip(keyword_list, similarities)
    ]
    sorted_results = sorted(results, key=lambda x: x["Cosine Similarity"], reverse=True)

    # Add primary keyword row at the top with "*" marker
    sorted_results.insert(0, primary_row)

    return sorted_results
31
+
32
+
33
def test_csv_structure():
    """Validate the structure of the CSV produced from generate_results().

    Runs five checks in sequence and prints progress as it goes:
    basic structure, primary keyword duplicated in the comparison list,
    a single comparison keyword, column order, and descending sort order.

    Returns:
        True when every check passes.

    Raises:
        AssertionError: On the first check that fails; later checks are not
            run.
    """
    separator = "\n" + "=" * 60 + "\n"
    print("Running CSV structure tests...\n")

    # Test 1: Basic functionality
    print("Test 1: Basic CSV structure")
    primary = "corporate cards"
    keywords = ["business cards", "credit cards", "debit cards"]
    results = generate_results(primary, keywords)
    df = pd.DataFrame(results)
    csv_output = df.to_csv(index=False)

    # Validate columns
    assert "Primary Keyword" in df.columns, "❌ Missing 'Primary Keyword' column"
    assert "Keyword" in df.columns, "❌ Missing 'Keyword' column"
    assert "Cosine Similarity" in df.columns, "❌ Missing 'Cosine Similarity' column"
    print("✅ All required columns present")

    # Validate primary keyword row is first
    first_row = df.iloc[0]
    assert first_row["Keyword"] == primary, f"❌ Primary keyword not first row. Got: {first_row['Keyword']}"
    assert first_row["Cosine Similarity"] == 1.000, f"❌ Primary keyword similarity not 1.000. Got: {first_row['Cosine Similarity']}"
    assert first_row["Primary Keyword"] == "*", f"❌ Primary keyword marker not '*'. Got: {first_row['Primary Keyword']}"
    print(f"✅ Primary keyword row correctly formatted: {first_row['Keyword']}, {first_row['Cosine Similarity']}, {first_row['Primary Keyword']}")

    # Validate comparison keywords have empty Primary Keyword column
    for idx in range(1, len(df)):
        marker = df.iloc[idx]["Primary Keyword"]
        assert marker == "", f"❌ Comparison keyword at row {idx+1} should have empty Primary Keyword. Got: '{marker}'"
    print(f"✅ All {len(df)-1} comparison keywords have empty Primary Keyword column")

    # Validate CSV format
    lines = csv_output.strip().split('\n')
    assert lines[0] == "Keyword,Cosine Similarity,Primary Keyword", f"❌ Incorrect CSV header. Got: {lines[0]}"
    assert lines[1].startswith(f"{primary},1.0"), f"❌ Primary keyword row format incorrect. Got: {lines[1]}"
    assert "*" in lines[1], f"❌ Missing '*' marker in primary keyword row. Got: {lines[1]}"
    print("✅ CSV format is correct")

    print(separator)

    # Test 2: Primary keyword in comparison list (edge case)
    print("Test 2: Primary keyword appears in comparison list")
    primary2 = "corporate cards"
    keywords2 = ["corporate cards", "business cards", "credit cards"]  # Primary is in list
    results2 = generate_results(primary2, keywords2)
    df2 = pd.DataFrame(results2)

    # Should have primary keyword row first with *
    assert df2.iloc[0]["Keyword"] == primary2, "❌ Primary keyword not first"
    assert df2.iloc[0]["Primary Keyword"] == "*", "❌ Primary keyword marker missing"

    # Should also have the duplicate in comparison results (with empty Primary Keyword)
    primary_count = (df2["Keyword"] == primary2).sum()
    assert primary_count >= 2, "❌ Primary keyword should appear at least twice (once with *, once without)"

    # Count rows with * marker (should be exactly 1)
    marker_count = (df2["Primary Keyword"] == "*").sum()
    assert marker_count == 1, f"❌ Should have exactly 1 '*' marker. Got: {marker_count}"
    print("✅ Primary keyword correctly marked with '*', duplicate entries handled properly")
    print(f" Total rows: {len(df2)}, Rows with '*': {marker_count}, Primary keyword occurrences: {primary_count}")

    print(separator)

    # Test 3: Single comparison keyword
    print("Test 3: Single comparison keyword")
    primary3 = "test keyword"
    keywords3 = ["test comparison"]
    results3 = generate_results(primary3, keywords3)
    df3 = pd.DataFrame(results3)

    assert len(df3) == 2, f"❌ Should have 2 rows (1 primary + 1 comparison). Got: {len(df3)}"
    assert df3.iloc[0]["Primary Keyword"] == "*", "❌ Primary keyword marker missing"
    assert df3.iloc[1]["Primary Keyword"] == "", "❌ Comparison keyword should have empty Primary Keyword"
    print("✅ Single comparison keyword handled correctly")

    print(separator)

    # Test 4: Column order (reuses the DataFrame built in Test 1)
    print("Test 4: Column order")
    expected_order = ["Keyword", "Cosine Similarity", "Primary Keyword"]
    actual_order = list(df.columns)
    assert actual_order == expected_order, f"❌ Column order incorrect. Expected: {expected_order}, Got: {actual_order}"
    print(f"✅ Column order correct: {actual_order}")

    print(separator)

    # Test 5: Similarity sorting
    print("Test 5: Results are sorted by similarity (descending)")
    primary4 = "corporate cards"
    keywords4 = ["business cards", "credit cards", "debit cards", "office supplies"]
    results4 = generate_results(primary4, keywords4)
    df4 = pd.DataFrame(results4)

    # Skip first row (primary keyword with 1.000) and check rest are sorted
    similarities = df4.iloc[1:]["Cosine Similarity"].values
    is_sorted = all(similarities[i] >= similarities[i+1] for i in range(len(similarities)-1))
    assert is_sorted, "❌ Comparison keywords not sorted by similarity (descending)"
    print("✅ Results correctly sorted by similarity (descending)")

    print(separator)

    print("✅ ALL TESTS PASSED! CSV output structure is correct.\n")
    return True
134
+
135
+
136
if __name__ == "__main__":
    # Imported locally so this entry point does not rely on the file's
    # top-level import block. sys.exit() is used rather than the bare
    # exit() helper, which is injected by the `site` module and is not
    # available in every interpreter configuration (e.g. `python -S`).
    import sys
    import traceback

    try:
        test_csv_structure()
    except AssertionError as e:
        # A failed check: report the assertion message and signal failure.
        print(f"\n❌ TEST FAILED: {e}")
        sys.exit(1)
    except Exception as e:
        # Anything else (model download, import problems, ...) is reported
        # with a full traceback to aid debugging.
        print(f"\n❌ ERROR: {e}")
        traceback.print_exc()
        sys.exit(1)