Spaces:

ReithBjarkan
/

SEO_Keyword_Similarity_Tool

Running

ReithBjarkan committed on Nov 4

Commit

46c7082

1 Parent(s): 4c2bddb

Add primary keyword to CSV output with * marker

- Add 'Primary Keyword' column to CSV output
- Include primary keyword as first row with similarity 1.000 and '*' marker
- Comparison keywords have empty string in Primary Keyword column
- Add comprehensive test suite to validate CSV structure
- Update documentation with test details

Files changed (3) hide show

TEST_README.md +54 -0
app.py +5 -1
test_csv_output.py +146 -0

TEST_README.md ADDED Viewed

	@@ -0,0 +1,54 @@

+# CSV Output Validation Tests
+This test suite validates that the CSV output from the Keyword Cosine Similarity Tool includes the primary keyword with proper formatting.
+## Running the Tests
+```bash
+python test_csv_output.py
+```
+## What the Tests Validate
+### Test 1: Basic CSV Structure
+- ✅ All required columns are present: `Keyword`, `Cosine Similarity`, `Primary Keyword`
+- ✅ Primary keyword appears in the first row
+- ✅ Primary keyword has similarity value of exactly `1.000`
+- ✅ Primary keyword has `*` marker in the `Primary Keyword` column
+- ✅ All comparison keywords have empty string `""` in the `Primary Keyword` column
+- ✅ CSV format is valid and correctly formatted
+### Test 2: Primary Keyword in Comparison List (Edge Case)
+- ✅ When the primary keyword appears in the comparison list, it's still marked with `*` in the first row
+- ✅ The duplicate entry in the comparison results has an empty `Primary Keyword` column
+- ✅ Exactly one row has the `*` marker (the primary keyword row)
+### Test 3: Single Comparison Keyword
+- ✅ Handles edge case of only one comparison keyword correctly
+- ✅ Primary keyword row is present with `*` marker
+- ✅ Comparison keyword has empty `Primary Keyword` column
+### Test 4: Column Order
+- ✅ Columns are in the correct order: `Keyword`, `Cosine Similarity`, `Primary Keyword`
+### Test 5: Similarity Sorting
+- ✅ All comparison keywords are sorted by similarity in descending order
+- ✅ Primary keyword (1.000) always appears first
+## Expected CSV Output Format
+```csv
+Keyword,Cosine Similarity,Primary Keyword
+corporate cards,1.0,*
+business corporate cards,0.8677525,
+card corporate,0.8351246,
+corporate card,0.83510983,
+...
+```
+## Key Validation Points
+1. **Primary Keyword Row**: Always first row, similarity = 1.000, `Primary Keyword` = `*`
+2. **Comparison Rows**: Empty string `""` in `Primary Keyword` column
+3. **Parser-Friendly**: Scripts can easily find the primary keyword by searching for the `*` marker
+4. **Self-Contained**: CSV includes all necessary information without external context

app.py CHANGED Viewed

@@ -94,8 +94,12 @@ if st.button("Calculate Similarities"):
         # Sort results by cosine similarity
         st.info("Sorting results...")
-        results = [{"Keyword": kw, "Cosine Similarity": sim} for kw, sim in zip(keyword_list, similarities)]
         sorted_results = sorted(results, key=lambda x: x["Cosine Similarity"], reverse=True)
         # Display results
         st.header("Results")

         # Sort results by cosine similarity
         st.info("Sorting results...")
+        results = [{"Keyword": kw, "Cosine Similarity": sim, "Primary Keyword": ""} for kw, sim in zip(keyword_list, similarities)]
         sorted_results = sorted(results, key=lambda x: x["Cosine Similarity"], reverse=True)
+        # Add primary keyword row at the top with "*" marker
+        primary_row = {"Keyword": primary_keyword, "Cosine Similarity": 1.000, "Primary Keyword": "*"}
+        sorted_results.insert(0, primary_row)
         # Display results
         st.header("Results")

test_csv_output.py ADDED Viewed

	@@ -0,0 +1,146 @@

+import pandas as pd
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+def generate_results(primary_keyword, keyword_list, model_name="sentence-transformers/all-MiniLM-L6-v2"):
+    """
+    Simulates the results generation logic from app.py
+    This function extracts the core logic to test CSV output structure
+    """
+    # Load the embedding model named by model_name. NOTE(review): a new model is constructed on every call and the test driver calls this function several times - consider reusing one instance.
+    model = SentenceTransformer(model_name)
+    # Generate embeddings (without convert_to_tensor to avoid device issues)
+    primary_embedding = model.encode(primary_keyword)  # single string in
+    keyword_embeddings = model.encode(keyword_list)  # list of strings in, one embedding per keyword
+    # Compare the primary embedding (wrapped in a list to form a one-row batch) against every keyword embedding; [0] takes that single row of scores
+    similarities = cosine_similarity([primary_embedding], keyword_embeddings)[0]
+    # Build one row per comparison keyword with a blank marker, then order the rows from highest to lowest similarity
+    results = [{"Keyword": kw, "Cosine Similarity": sim, "Primary Keyword": ""} for kw, sim in zip(keyword_list, similarities)]
+    sorted_results = sorted(results, key=lambda x: x["Cosine Similarity"], reverse=True)
+    # Prepend the primary keyword itself with a fixed similarity of 1.000 and the "*" marker; inserting after the sort keeps it first regardless of the other scores
+    primary_row = {"Keyword": primary_keyword, "Cosine Similarity": 1.000, "Primary Keyword": "*"}
+    sorted_results.insert(0, primary_row)
+    return sorted_results  # list of dicts: primary row first, then comparison rows by descending similarity
+def test_csv_structure():
+    """Test that CSV output has correct structure"""
+    print("Running CSV structure tests...\n")
+    # Test 1: three comparison keywords, primary keyword not among them
+    print("Test 1: Basic CSV structure")
+    primary = "corporate cards"
+    keywords = ["business cards", "credit cards", "debit cards"]
+    results = generate_results(primary, keywords)
+    df = pd.DataFrame(results)
+    csv_output = df.to_csv(index=False)  # CSV text without the index column
+    # All three expected columns must exist in the frame
+    assert "Primary Keyword" in df.columns, "❌ Missing 'Primary Keyword' column"
+    assert "Keyword" in df.columns, "❌ Missing 'Keyword' column"
+    assert "Cosine Similarity" in df.columns, "❌ Missing 'Cosine Similarity' column"
+    print("✅ All required columns present")
+    # Row 0 must be the primary keyword with similarity 1.000 and the "*" marker
+    assert df.iloc[0]["Keyword"] == primary, f"❌ Primary keyword not first row. Got: {df.iloc[0]['Keyword']}"
+    assert df.iloc[0]["Cosine Similarity"] == 1.000, f"❌ Primary keyword similarity not 1.000. Got: {df.iloc[0]['Cosine Similarity']}"
+    assert df.iloc[0]["Primary Keyword"] == "*", f"❌ Primary keyword marker not '*'. Got: {df.iloc[0]['Primary Keyword']}"
+    print(f"✅ Primary keyword row correctly formatted: {df.iloc[0]['Keyword']}, {df.iloc[0]['Cosine Similarity']}, {df.iloc[0]['Primary Keyword']}")
+    # Every row after the first is a comparison keyword and must carry an empty marker
+    for idx in range(1, len(df)):
+        assert df.iloc[idx]["Primary Keyword"] == "", f"❌ Comparison keyword at row {idx+1} should have empty Primary Keyword. Got: '{df.iloc[idx]['Primary Keyword']}'"
+    print(f"✅ All {len(df)-1} comparison keywords have empty Primary Keyword column")
+    # Check the serialised text itself: header line, then the primary row
+    lines = csv_output.strip().split('\n')
+    assert lines[0] == "Keyword,Cosine Similarity,Primary Keyword", f"❌ Incorrect CSV header. Got: {lines[0]}"
+    assert lines[1].startswith(f"{primary},1.0"), f"❌ Primary keyword row format incorrect. Got: {lines[1]}"  # prefix check only, so any rendering beginning "1.0" passes
+    assert "*" in lines[1], f"❌ Missing '*' marker in primary keyword row. Got: {lines[1]}"
+    print("✅ CSV format is correct")
+    print("\n" + "="*60 + "\n")
+    # Test 2: the primary keyword is also passed as a comparison keyword (edge case)
+    print("Test 2: Primary keyword appears in comparison list")
+    primary2 = "corporate cards"
+    keywords2 = ["corporate cards", "business cards", "credit cards"]  # Primary is in list
+    results2 = generate_results(primary2, keywords2)
+    df2 = pd.DataFrame(results2)
+    # The marked primary row must still come first
+    assert df2.iloc[0]["Keyword"] == primary2, "❌ Primary keyword not first"
+    assert df2.iloc[0]["Primary Keyword"] == "*", "❌ Primary keyword marker missing"
+    # The duplicate from the comparison list is kept (not de-duplicated), so the keyword occurs at least twice
+    primary_count = (df2["Keyword"] == primary2).sum()
+    assert primary_count >= 2, "❌ Primary keyword should appear at least twice (once with *, once without)"
+    # Only the prepended row may carry the "*" marker
+    marker_count = (df2["Primary Keyword"] == "*").sum()
+    assert marker_count == 1, f"❌ Should have exactly 1 '*' marker. Got: {marker_count}"
+    print(f"✅ Primary keyword correctly marked with '*', duplicate entries handled properly")  # NOTE(review): f-prefix is unnecessary here (no placeholders)
+    print(f"   Total rows: {len(df2)}, Rows with '*': {marker_count}, Primary keyword occurrences: {primary_count}")
+    print("\n" + "="*60 + "\n")
+    # Test 3: smallest non-empty input, a single comparison keyword
+    print("Test 3: Single comparison keyword")
+    primary3 = "test keyword"
+    keywords3 = ["test comparison"]
+    results3 = generate_results(primary3, keywords3)
+    df3 = pd.DataFrame(results3)
+    assert len(df3) == 2, f"❌ Should have 2 rows (1 primary + 1 comparison). Got: {len(df3)}"
+    assert df3.iloc[0]["Primary Keyword"] == "*", "❌ Primary keyword marker missing"
+    assert df3.iloc[1]["Primary Keyword"] == "", "❌ Comparison keyword should have empty Primary Keyword"
+    print("✅ Single comparison keyword handled correctly")
+    print("\n" + "="*60 + "\n")
+    # Test 4: column order of the frame
+    print("Test 4: Column order")
+    expected_order = ["Keyword", "Cosine Similarity", "Primary Keyword"]
+    actual_order = list(df.columns)  # reuses the Test 1 frame; no new results are generated for this check
+    assert actual_order == expected_order, f"❌ Column order incorrect. Expected: {expected_order}, Got: {actual_order}"
+    print(f"✅ Column order correct: {actual_order}")
+    print("\n" + "="*60 + "\n")
+    # Test 5: comparison rows must be in descending similarity order
+    print("Test 5: Results are sorted by similarity (descending)")
+    primary4 = "corporate cards"
+    keywords4 = ["business cards", "credit cards", "debit cards", "office supplies"]
+    results4 = generate_results(primary4, keywords4)
+    df4 = pd.DataFrame(results4)
+    # Skip first row (primary keyword with 1.000) and check rest are sorted
+    similarities = df4.iloc[1:]["Cosine Similarity"].values
+    is_sorted = all(similarities[i] >= similarities[i+1] for i in range(len(similarities)-1))  # each score must be >= its successor; ties are allowed
+    assert is_sorted, "❌ Comparison keywords not sorted by similarity (descending)"
+    print("✅ Results correctly sorted by similarity (descending)")
+    print("\n" + "="*60 + "\n")
+    print("✅ ALL TESTS PASSED! CSV output structure is correct.\n")
+    return True  # reached only if no assertion above failed
+if __name__ == "__main__":  # script entry point: run the checks and map any failure to exit status 1
+    try:
+        test_csv_structure()
+    except AssertionError as e:  # a validation check failed; its message describes which one
+        print(f"\n❌ TEST FAILED: {e}")
+        exit(1)  # NOTE(review): exit() is the helper installed by the site module; sys.exit(1) is the conventional choice in scripts
+    except Exception as e:  # anything else (e.g. model loading problems) is reported with a full traceback
+        print(f"\n❌ ERROR: {e}")
+        import traceback
+        traceback.print_exc()
+        exit(1)  # NOTE(review): same remark as above regarding sys.exit