devjas1 committed
Commit d38cc5e · 1 Parent(s): 1878de6

(UPDATE): expand .gitattributes to include additional file types for LFS tracking


(UPDATE): enhance README with detailed project description and features; refactor embed_documents function for improved error handling and encoding support

.gitattributes CHANGED
@@ -1,2 +1,39 @@
 *.gguf filter=lfs diff=lfs merge=lfs -text
 C:/Users/xJB6x/Projects/CodeMind/models/embeddinggemma-300m/* filter=lfs diff=lfs merge=lfs -text
+
+
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
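
One way to sanity-check an .gitattributes change like this is to ask git which `filter` attribute it assigns to a given path; `git check-attr` reports `lfs` for anything matched by the patterns above. The snippet below is a minimal sketch that wraps that command; the file names passed in are hypothetical, not files from this repo:

```python
import subprocess

def lfs_filter(path: str) -> str:
    """Return the value of the 'filter' attribute git assigns to `path`."""
    out = subprocess.run(
        ["git", "check-attr", "filter", "--", path],
        capture_output=True, text=True, check=True,
    ).stdout
    # Output looks like: "model.safetensors: filter: lfs"
    return out.strip().rsplit(":", 1)[-1].strip()

# Hypothetical paths -- any file in the working tree works here
for p in ["model.safetensors", "README.md"]:
    print(p, "->", lfs_filter(p))  # "lfs" if LFS-tracked, "unspecified" otherwise
```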
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 title: CodeMind
-emoji: 🏆
+emoji: 🔧
 colorFrom: purple
 colorTo: indigo
 sdk: static
@@ -9,14 +9,18 @@ license: apache-2.0
 short_description: AI-powered development assistant CLI Tool
 ---
 
-## CodeMind: Local AI Development Assistant
-
-CodeMind is an AI-powered development assistant that runs entirely on your local machine. It helps you understand your codebase through semantic search and generates meaningful commit messages using locally hosted language models, ensuring complete privacy and no cloud dependencies.
+**CodeMind** is an AI-powered development assistant for intelligent document analysis and commit message generation that runs entirely on your local machine. It leverages modern machine learning models to help you understand your codebase through semantic search and to generate meaningful commit messages with locally hosted language models, ensuring complete privacy and no cloud dependencies.
 
+- **Efficient Knowledge Retrieval**: Makes searching and querying documentation more powerful by using semantic embeddings rather than keyword search.
+- **Smarter Git Workflow**: Automates the creation of meaningful commit messages by analyzing git diffs and using an LLM to summarize changes.
+- **AI-Powered Documentation**: Enables you to ask questions about your project, using your own docs/context rather than just generic answers.
 
 ## Features
 
-- **Semantic Code Search**: Find relevant code and documentation using AI-powered semantic search
-- **Commit Message Generation**: Automatically generate descriptive commit messages based on your changes
+- **Document Embedding** (using [EmbeddingGemma-300m](https://huggingface.co/google/embeddinggemma-300m))
+- **Semantic Search** (using [FAISS](https://github.com/facebookresearch/faiss) for vector similarity search)
+- **Commit Message Generation** (using [Phi-2](https://huggingface.co/microsoft/phi-2-gguf) for text generation): Automatically generate descriptive commit messages based on your changes
+- **Retrieval-Augmented Generation (RAG)**: Answers questions using indexed document context
 - **Local Processing**: All AI processing happens on your machine with no data sent to cloud services
 - **Flexible Configuration**: Customize models and parameters to suit your specific needs
 - **FAISS Integration**: Efficient vector similarity search for fast retrieval
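
This commit does not touch the generator itself, but the "Smarter Git Workflow" feature the README describes (git diff in, commit message out via a local Phi-2 GGUF model) might look roughly like the sketch below. It assumes `llama-cpp-python` is installed; the model path, prompt wording, and truncation limit are illustrative, not the project's actual code:

```python
import subprocess
from llama_cpp import Llama  # assumes llama-cpp-python is installed

# Hypothetical local model path; the README names Phi-2 in GGUF format
llm = Llama(model_path="models/phi-2.Q4_K_M.gguf", n_ctx=2048, verbose=False)

def suggest_commit_message() -> str:
    """Summarize staged changes into a one-line commit message."""
    diff = subprocess.run(
        ["git", "diff", "--cached"], capture_output=True, text=True
    ).stdout
    prompt = (
        "Summarize the following git diff as a concise, imperative "
        f"commit message:\n\n{diff[:4000]}\n\nCommit message:"
    )
    out = llm(prompt, max_tokens=64, stop=["\n"])
    return out["choices"][0]["text"].strip()

print(suggest_commit_message())
```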
src/__pycache__.py ADDED
File without changes
src/__pycache__/config_loader.cpython-310.pyc DELETED
Binary file (763 Bytes)
 
src/__pycache__/diff_analyzer.cpython-310.pyc DELETED
Binary file (1.1 kB)
 
src/__pycache__/embedder.cpython-310.pyc DELETED
Binary file (924 Bytes)
 
src/__pycache__/generator.cpython-310.pyc DELETED
Binary file (1.28 kB)
 
src/__pycache__/retriever.cpython-310.pyc DELETED
Binary file (647 Bytes)
 
src/embedder.py CHANGED
@@ -2,26 +2,30 @@
 This script handles document embedding using EmbeddingGemma.
 This is the entry point for indexing documents.
 """
-
+from sentence_transformers import SentenceTransformer
+import numpy as np
+import faiss
 import os
 import pickle
-import faiss
-import numpy as np
-from sentence_transformers import SentenceTransformer
+from typing import List, Tuple
 
 
-def embed_documents(path: str, config: dict):
+def embed_documents(path: str, config: dict) -> List[Tuple[str, np.ndarray]]:
     """
     Embed documents from a directory and save to FAISS index.
 
     Args:
         path (str): Path to the directory containing the documents to embed.
        config (dict): Configuration dictionary.
+
+    Returns:
+        List of tuples containing (filename, embedding)
     """
     try:
         model = SentenceTransformer(config["embedding"]["model_path"])
-        print(f"Initalized embedding model: {config['embedding']['model_path']}")
-    except ValueError as e:
+        print(
+            f"Initialized embedding model: {config['embedding']['model_path']}")
+    except Exception as e:  # Changed to catch broader exception
         print(f"Error initializing embedding model: {e}")
         return []
 
@@ -34,38 +38,72 @@ def embed_documents(path: str, config: dict):
         fpath = os.path.join(path, fname)
         if os.path.isfile(fpath):
             try:
-                with open(fpath, "r", encoding="utf-8") as f:
-                    text = f.read()
-                    if text.strip():  # Only process non-empty files
-                        emb = model.encode(text)
-                        embeddings.append(emb)
-                        texts.append(text)
-                        filenames.append(fname)
+                # Try different encodings to handle various file types
+                for encoding in ['utf-8', 'latin-1', 'cp1252']:
+                    try:
+                        with open(fpath, "r", encoding=encoding) as f:
+                            text = f.read()
+                        break
+                    except UnicodeDecodeError:
+                        continue
+                else:
+                    print(
+                        f"Could not decode file {fpath} with common encodings")
+                    continue
+
+                if text.strip():  # Only process non-empty files
+                    emb = model.encode(text)
+                    # Ensure all embeddings have the same dimension
+                    if embeddings and emb.shape[0] != embeddings[0].shape[0]:
+                        print(f"Dimension mismatch in file {fname}, skipping")
+                        continue
+
+                    embeddings.append(emb)
+                    texts.append(text)
+                    filenames.append(fname)
+
             except Exception as e:
-                print(f"Error reading file {fpath}: {e}")
+                print(f"Error processing file {fpath}: {e}")
 
     if not embeddings:
         print("No documents were successfully embedded.")
         return []
 
+    print("Embedder script started", flush=True)
+    print(f"Documents in path: {os.listdir(path)}")
+    print(f"Successfully processed {len(embeddings)} documents")
+
     # Create FAISS index
     dimension = embeddings[0].shape[0]
     index = faiss.IndexFlatIP(dimension)
 
-    # Normalize embeddings for cosine similarity
+    # Convert to numpy array and normalize
     embeddings_matrix = np.array(embeddings).astype("float32")
-    faiss.normalize_L2(embeddings_matrix)
-    # Add embeddings to index
+    faiss.normalize_L2(embeddings_matrix)  # Normalize for cosine similarity
+
+    # Add normalized embeddings to index
     index.add(embeddings_matrix)
 
     # Save FAISS index and metadata
     os.makedirs("vector_cache", exist_ok=True)
     faiss.write_index(index, "vector_cache/faiss_index.bin")
 
+    # Save metadata
     with open("vector_cache/metadata.pkl", "wb") as f:
         pickle.dump({"texts": texts, "filenames": filenames}, f)
 
-    print(f"Saved FAISS index to vector_cache/ with {len(embeddings)} documents.")
+    print(
+        f"Saved FAISS index to vector_cache/ with {len(embeddings)} documents.")
     print(f"Total embeddings created: {len(embeddings)}")
 
     return list(zip(filenames, embeddings))
+
+
+# Example usage
+if __name__ == "__main__":
+    config = {
+        "embedding": {
+            "model_path": "sentence-transformers/all-MiniLM-L6-v2"  # Example model
+        }
+    }
+    result = embed_documents("./docs", config)
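
Because `embed_documents` L2-normalizes the vectors before adding them to an `IndexFlatIP`, inner-product scores are cosine similarities at query time. The retriever is not part of this commit, so the following is only a sketch of how querying `vector_cache/` could work; the model name must match whatever was used for indexing, and the query string and `k` are arbitrary:

```python
import pickle

import faiss
from sentence_transformers import SentenceTransformer

# Load the artifacts written by embed_documents()
index = faiss.read_index("vector_cache/faiss_index.bin")
with open("vector_cache/metadata.pkl", "rb") as f:
    meta = pickle.load(f)

# Must be the same model used at indexing time (example model from above)
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def search(query: str, k: int = 3):
    """Return the top-k (filename, cosine score) matches for a query."""
    q = model.encode(query).astype("float32").reshape(1, -1)
    faiss.normalize_L2(q)  # same normalization as at index time
    scores, ids = index.search(q, k)
    return [
        (meta["filenames"][i], float(s))
        for i, s in zip(ids[0], scores[0])
        if i != -1  # FAISS pads with -1 when fewer than k results exist
    ]

print(search("How does commit message generation work?"))
```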