author    | Nate Buttke <nate-web@riseup.net> | 2023-08-14 20:35:45 -0700
committer | Nate Buttke <nate-web@riseup.net> | 2023-08-14 20:35:45 -0700
commit    | 7435e423776c7b35b9c6c9bebba25a44691554bf (patch)
tree      | 2196a0e68bc9ff2c2df92590444eee8b785f4a11
parent    | f334391613e01057d572e0228aa4f3c2f24346dc (diff)
Huge fix to path handling. Added clean CLI, cost estimate, and ignorefile support.
-rw-r--r-- | requirements.txt |  14
-rw-r--r-- | server.py        | 208
-rw-r--r-- | setup.py         | 200
3 files changed, 263 insertions, 159 deletions
diff --git a/requirements.txt b/requirements.txt
index 1beac1b..b60ffb4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,8 @@
-Flask==2.3.2
-numpy==1.25.0
-openai==0.27.8
-pandas==2.0.3
-tiktoken==0.4.0
-tree_sitter==0.20.1
+Flask
+numpy
+openai
+pandas
+tiktoken
+tree_sitter
+typer
+typing_extensions
diff --git a/server.py b/server.py
--- a/server.py
+++ b/server.py
@@ -1,49 +1,56 @@
-from flask import Flask, request, Response, jsonify, render_template
-import pandas as pd
 from collections import defaultdict
 import os
-import json
+import sys
+from pathlib import PurePosixPath
+import base64
+from flask import Flask, request, jsonify, render_template
+import pandas as pd
 import openai
 from openai.embeddings_utils import get_embedding, cosine_similarity
 openai.api_key = os.getenv('OPENAI_KEY')
-import numpy as np
-from pathlib import PurePosixPath
-FILETYPES = ['.sh', '.c', '.h', '.cpp', '.cxx', '.hxx', '.hpp', '.go', '.hs', '.js', '.py', '.rs']
+if len(sys.argv) != 2:
+    print("USAGE: python server.py PATH_TO_CSV")
+    print("wrong number of arguments", file=sys.stderr)
+    sys.exit(1)
+try:
+    df = pd.read_csv(sys.argv[1], converters={"embedding_summary": pd.eval})
+except Exception:
+    print(f"Problem opening {sys.argv[1]}", file=sys.stderr)
+    sys.exit(1)
+
 app = Flask(__name__, template_folder="./frontend", static_folder="./frontend", static_url_path="")
 
 def search_code(df, query, n=4):
     query_embedding = get_embedding(
-        query,
-        engine="text-embedding-ada-002"
-    )
+        query,
+        engine="text-embedding-ada-002"
+    )
     df["similarity"] = df.embedding_summary.apply(lambda x: cosine_similarity(x, query_embedding))
     results = (
-        df.sort_values("similarity", ascending=False)
-    )
+        df.sort_values("similarity", ascending=False)
+    )
     return results.head(n)
-
 def generate_answer(question):
-    results = search_code(df, question, n=4)
-    prompt = ''
-    for i in range(3):
-        prompt += results.iloc[i]["summary"] + "\n" + results.iloc[i]["blob"] + "\n"
-    prompt += "\n" + "Answer the following question using the code context given above, and show an example with 'Example'\nQ: " + question + "\nA: "
-    response = openai.Completion.create(
-        model="text-davinci-003",
-        # model="code-davinci-002",
-        prompt=prompt,
-        temperature=0.7,
-        max_tokens=1000,
-        top_p=1.0,
-        frequency_penalty=0.0,
-        presence_penalty=0.0,
-        stop=["\"\"\""]
-    )
-    return response["choices"][0]["text"]
-
+    results = search_code(df, question, n=4)
+    prompt = ''
+    for i in range(3):
+        prompt += results.iloc[i]["summary"] + "\n" + results.iloc[i]["blob"] + "\n"
+    prompt += "\n" + "Answer the following question using the code context given above, and show an example with 'Example'\nQ: " + question + "\nA: "
+    response = openai.Completion.create(
+        model="text-davinci-003",
+        # model="code-davinci-002",
+        prompt=prompt,
+        temperature=0.7,
+        max_tokens=1000,
+        top_p=1.0,
+        frequency_penalty=0.0,
+        presence_penalty=0.0,
+        stop=["\"\"\""]
+    )
+    return response["choices"][0]["text"]
 def add_to_tree(tree: dict, path: str):
     parts = PurePosixPath(path).parts
@@ -65,72 +72,111 @@ def create_directory_tree(df):
         add_to_tree(directory_tree, path)
     return directory_tree
-# Nate: these are from the original project. My own csv is below
-#df = pd.read_csv("./frontend/data/embedded_summarized.csv")
-#df = pd.read_csv("./frontend/data/withsummary2.csv")
+def get_outermost_item(dirstructure):
+    return list(dirstructure.keys())[0]
-# My line
-df=pd.read_csv("./frontend/data/test_with_summary_and_embeddings.csv", converters={'embedding_summary': pd.eval})
+def get_kids_of_root(dirstructure):
+    return list(dirstructure.values())[0].get("children").keys()
-# need to do funny stuff to read in the data frame correctly from csv. that's
-# why the eval() is below. and pd.eval is above.
-# df['embeddings'] = df['embeddings'].apply(lambda x: eval(x))
 def check_path(path, dirstructure):
-    children = None
-    components = PurePosixPath(path).parts
-    if(components[0] in dirstructure.keys()):
-        print('SCREAM')
-
-        currentdict = dirstructure.get(components[0]) #outermost is solitary by setup.py
-        print(len(components))
-        for component in components[1:]:
-            print('COMP', component)
-            if component in currentdict.get("children", {}):
-                currentdict = currentdict["children"][component]
-            else:
-                return False, None, children
-        if currentdict["filetype"] == "dir":
-            children = currentdict["children"]
-        return True, currentdict["filetype"], children
-    else:
-        return False, None, None
+    components = PurePosixPath(path).parts
+    if components[0] in dirstructure.keys():
+        if dirstructure[components[0]]["filetype"] == "dir":
+            subdict = dirstructure.get(components[0])
+            if len(components) == 1:
+                ftype = subdict["filetype"]
+                kids = list(subdict["children"].keys())
+                return True, str(ftype), kids
+            else:
+                for c in components[1:]:
+                    if c in subdict["children"]:
+                        found = True
+                        subdict = subdict["children"].get(c)
+                        ftype = subdict["filetype"]
+                        kids = list(subdict["children"].keys()) if ftype == 'dir' else None
+                    else:
+                        found = False
+                        break
+                if found:
+                    return found, str(ftype), kids
+        else:
+            return True, "file", None
+    return False, None, None
+
+# need to do this to read in the data frame correctly from csv. (pd.eval)
+#df=pd.read_csv("./frontend/data/test_with_summary_and_embeddings.csv", converters={'embedding_summary': pd.eval})
+#df=pd.read_csv("./frontend/data/r2.csv", converters={'embedding_summary': pd.eval})
+
+# old strategy:
+# df['embeddings'] = df['embeddings'].apply(lambda x: eval(x))
+
 @app.route('/')
 def home():
-    req_path = request.args.get('path')
-    dirstructure = create_directory_tree(df)
-    if req_path is None:
-        loctype = "folder"
-        text = [[x, ""] for x in list(df['file_path'].unique())]
-    else:
-        text = [[x, y] for x, y in zip(
-            list(df[df['file_path'] == req_path]['source']),
-            list(df[df['file_path'] == req_path]['summary'])
-        )]
-        loctype = "file"
-
-    res = {
-        'loctype': loctype,
-        'text': text,
-        'current': 'root directory' if (req_path is None) else req_path
-    }
-    return render_template('index.html', payload=res)
+    req_path = request.args.get('path')
+    path_decode = None if req_path is None else base64.urlsafe_b64decode(req_path).decode('utf-8')
+    dirstructure = create_directory_tree(df)
+    if req_path is None:
+        rootname = get_outermost_item(dirstructure)
+        if check_path(rootname, dirstructure)[1] == "file":
+            if len(dirstructure.keys()) == 1:
+                loctype = "file"
+                text = [[(x, None), y] for x, y in zip(
+                    list(df[df["file_path"] == rootname]["source"]),
+                    list(df[df["file_path"] == rootname]["summary"])
+                )]
+            else:
+                loctype = "folder"
+                files = list(dirstructure.keys())
+                text = [[(x, base64.urlsafe_b64encode(bytes(x, 'utf-8')).decode("utf-8")), ""] for x in files]
+        else:
+            loctype = "folder"
+            if not rootname.endswith("/"):
+                rootname += '/'
+            kids = get_kids_of_root(dirstructure)
+            text = [[(k, base64.urlsafe_b64encode(bytes(rootname + k, 'utf-8')).decode('utf-8')), ""] for k in kids]
+    else:
+        path_info = check_path(path_decode, dirstructure)
+        if path_info[0] is False:
+            text = [["error", "path not found"]]
+            loctype = "file"
+        elif path_info[1] == "file":
+            loctype = "file"
+            text = [[(x, None), y] for x, y in zip(
+                list(df[df["file_path"] == path_decode]["source"]),
+                list(df[df["file_path"] == path_decode]["summary"])
+            )]
+        elif path_info[1] == "dir":
+            loctype = "folder"
+            text = [[(x, base64.urlsafe_b64encode(bytes(path_decode + "/" + x, 'utf-8')).decode("utf-8")), ""] for x in path_info[2]]
+
+    if req_path is not None:
+        curr = path_decode
+    elif loctype == "folder":
+        curr = 'root directory'
+    else:
+        curr = get_outermost_item(dirstructure)
+
+    res = {
+        'loctype': loctype,
+        'text': text,
+        'current': curr
+    }
+    return render_template('index.html', payload=res)
 
 @app.route('/answer')
 def answer():
-    q = request.args.get('q', '').strip()
-    a = search_code(df, q)
-    res = [{'blob': x['blob'], 'summary': x['summary']} for x in a.to_dict('records')]
-
-    return jsonify(res)
+    q = request.args.get('q', '').strip()
+    a = search_code(df, q)
+    res = [{'blob': x['blob'], 'summary': x['summary']} for x in a.to_dict('records')]
+    return jsonify(res)
 
 @app.route('/explain')
 def explain():
-    q = request.args.get('q', '').strip()
-    a = generate_answer(q)
-    return jsonify(a)
+    q = request.args.get('q', '').strip()
+    a = generate_answer(q)
+    return jsonify(a)
 
 if __name__ == '__main__':
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -1,18 +1,19 @@
-from collections import defaultdict
 import os
 import sys
 import pandas as pd
-import openai 
+import openai
 import tiktoken
-from openai.embeddings_utils import get_embedding, cosine_similarity
-
+from openai.embeddings_utils import get_embedding
 from tree_sitter import Language, Parser
-
-SOURCE_DIR = './'
+from typing_extensions import Annotated
+import typer
 
 openai.api_key = os.getenv('END_OF_WORLD')
 
 class TS_Setup_Helper:
+    """
+    Tree sitter functions and data for the setup process
+    """
     parser: Parser
     ts_obj_path: str
     ext_map: dict
@@ -50,7 +51,8 @@ class TS_Setup_Helper:
             self.CPP_LANGUAGE: ["""(function_definition) @function""", """(preproc_include) @import"""],
             self.GO_LANGUAGE: ["""(function_declaration) @function""", """(method_declaration) @method"""],
             self.JS_LANGUAGE: ["""[(function) (function_declaration)] @function"""],
-            self.PY_LANGUAGE: ["""(function_definition) @function""", """[(import_statement) (import_from_statement)] @import"""],
+            self.PY_LANGUAGE: ["""(function_definition) @function""",
+                               """[(import_statement) (import_from_statement)] @import"""],
             self.RS_LANGUAGE: ["""(function_item) @function""", """(use_declaration) @import"""]
         }
 
@@ -61,11 +63,8 @@ class TS_Setup_Helper:
     def ts_get_all_code_blocks(self, code_blocks, file_path, lang, tree, code):
         """Use treesitter to get all code blocks"""
-        # TODO need way to switch between declaration and definition ..
-        # e.g. golang does not have function definitions according to treesitter
         results = [ ]
         for query in self.qmap.get(lang):
-            print(query)
             results += self.ts_query(lang, tree, query)
 
         # TODO something like list comprehension here?
 
@@ -81,22 +80,21 @@ class TS_Setup_Helper:
             code_blocks.append(return_dict)
 
     def parse_file(self, file_path):
-        print('parse')
         """take source code file and return pd dataframe"""
         # read file
-        with open(file_path, 'r') as f:
+        with open(file_path[0], 'r') as f:
             code = f.read()
 
         # Tree-Sitter
-        extension = os.path.splitext(file_path)[1].lstrip(".")
+        extension = os.path.splitext(file_path[0])[1].lstrip(".")
         lang = self.ext_map.get(extension)
         if lang is None:
-            raise NotImplementedError(f"The file extension .{extension} is not implemented")
+            raise NotImplementedError(f"The file extension .{extension} is not implemented ({file_path[0]})")
         self.parser.set_language(lang)
         tree = self.parser.parse(bytes(code, "utf8"))
 
         code_blocks = []
-        self.ts_get_all_code_blocks(code_blocks, file_path, lang, tree, bytes(code, "utf8"))
+        self.ts_get_all_code_blocks(code_blocks, file_path[1], lang, tree, bytes(code, "utf8"))
 
         collate_types = ['import', 'assign']
         tempblock = None
@@ -123,72 +121,130 @@ class TS_Setup_Helper:
         return df
 
-def get_files_to_parse(root_path, files_extensions_to_parse, dirs_to_ignore=['tests', 'vendor', 'unix']) -> list:
+def get_files_to_parse(root_path, files_extensions_to_parse, dirs_to_ignore) -> list:
     """get all source file paths as list."""
     files_to_parse = []
     for root, dirs, files in os.walk(root_path):
-        # there is probably a better way to do this
+        # there may be a better way to do this
         # https://stackoverflow.com/questions/13454164/os-walk-without-hidden-folders
-        files = [f for f in files if not f[0] == '.']
+        files = [
+            f for f in files if (not f[0] == '.')
+            and (os.path.splitext(f)[-1].lstrip(".") in files_extensions_to_parse)
+        ]
         dirs[:] = [d for d in dirs if (not d[0] == '.') and (set(d.split()).isdisjoint(dirs_to_ignore))]
         for name in files:
-            #if (dirfix(root).rsplit("/", 1)[-1] in dirs_to_ignore) or (name in dirs_to_ignore) or (name.rsplit('.')[-1] not in files_extensions_to_parse):
-            if (name.rsplit('.')[-1] not in files_extensions_to_parse):
-                continue
-            temp_path = os.path.join(root, name)
-            files_to_parse.append(temp_path)
+            full = os.path.join(root, name)
+            rel_dir = os.path.relpath(root, root_path)
+            rel_filepath = os.path.join(rel_dir, name)
+            if rel_filepath.startswith("./"):
+                rel_filepath = rel_filepath[len("./"):]
+            files_to_parse.append((full, rel_filepath))
     return files_to_parse
 
 def generate_summary(prompt):
-    enc = tiktoken.encoding_for_model("text-davinci-003")
-    if (len(enc.encode(prompt)) > 2500):
-        return "too long to summarize."
-
-    prompt = prompt + '\nSummarize the above code: '
-    response = openai.Completion.create(
-        model="text-davinci-003",
-        prompt=prompt,
-        temperature=0.7,
-        max_tokens=1024,
-        top_p=1.0,
-        frequency_penalty=0.0,
-        presence_penalty=0.0,
-        stop=["\"\"\""]
-    )
-    return response["choices"][0]["text"]
-
-
-# nate function to create blob. the blob just contains the file path and the source code.
+    enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
+    if (len(enc.encode(prompt)) > 3000):
+        return "too long to summarize."
+
+    prompt = prompt + '\nSummarize the above code: '
+
+    # response = openai.ChatCompletion.create(
+    #     model="gpt-3.5-turbo",
+    #     messages=[{"role": "user", "content": prompt}],
+    #     temperature=0.7,
+    #     max_tokens=1024,
+    #     top_p=1.0,
+    #     frequency_penalty=0.0,
+    #     presence_penalty=0.0,
+    #     stop=["\"\"\""]
+    # )
+
+    #return response["choices"][0]["message"]["content"]
+    return 'herro. this is a test summary'
+
+# create blob. the blob just contains the file path and the source code.
 def blobify(pandaSeries):
     return f"file path: {pandaSeries['file_path']}\n {pandaSeries['source']}"
 
-
-### doing stuff!!
-ts_helper = TS_Setup_Helper('./ts-languages.so')
-
-code_df = pd.DataFrame()
-#files = get_files_to_parse("../../dirserver/src/dirserver/", ts_helper.ext_map.keys(), dirs_to_ignore=['tests', 'vendor', 'unix']):
-
-files = get_files_to_parse("./rs", ts_helper.ext_map.keys())
-if len(files) == 0:
-    print("didn't find any files to parse", file=sys.stderr)
-    exit(1)
-for file in files:
-    code_df = pd.concat([code_df, ts_helper.parse_file(file)])
-
-code_df["blob"] = code_df.apply(lambda x: blobify(x),axis=1)
-print(type(code_df))
-print(code_df)
-
-code_df.to_csv('rust_with_blob.csv')
-
-print('startng to generate summary')
-code_df["summary"] = code_df.blob.apply(lambda x: generate_summary(x))
-print('done with generate summary')
-
-print('generating embeddings')
-embedding_model = "text-embedding-ada-002"
-code_df["embedding_summary"] = code_df.summary.apply([lambda x: get_embedding(x, engine=embedding_model)])
-print('done with generating embeddings')
-
-code_df.to_csv('test_with_summary_and_embeddings.csv')
+def estimate_cost(df, skip_summary: bool):
+    enc = tiktoken.encoding_for_model("text-embedding-ada-002")
+    print(f'found {len(df.blob)} fns')
+    token_count = 0
+    for s in df.blob:
+        token_count += len(enc.encode(s))
+    embed_cost = (token_count / 1000) * 0.0001  # Ada v2
+    print(f"it will cost ~${embed_cost:.6f} to generate embeddings")
+
+    if not skip_summary:
+        enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
+        token_count = 0
+        for s in df.blob:
+            token_count += len(enc.encode(s))
+        summary_cost = ((token_count / 1000) * 0.0015) + (len(df.blob) * (500 / 1000) * 0.002)
+        print(f"it will cost ~${summary_cost:.6f} to generate summaries (see --skip-summary)")
+        print(f"which is ~${embed_cost + summary_cost:.6f} total.")
+
+    if input("\nType yes to continue or anything else to quit: ") != "yes":
+        sys.exit(0)
+    return
+
+
+def setup(
+        filepath: Annotated[str, typer.Argument(help="path to repo")],
+        output_csv_filepath: Annotated[str, typer.Argument(help="filepath for csv output")],
+        ignorefile: Annotated[str, typer.Option(help="Path to text file containing dirnames to ignore. One name per line.")] = None,
+        skip_summary: Annotated[bool, typer.Option(help="Do not produce summaries for each function (to save cost).")] = False
+        ):
+
+    dirs_to_ignore = []
+    if ignorefile is not None:
+        # https://stackoverflow.com/questions/3925614/how-do-you-read-a-file-into-a-list-in-python
+        try:
+            with open(ignorefile) as file:
+                for line in file:
+                    line = line.strip()
+                    dirs_to_ignore.append(line)
+        except OSError:
+            print(f"IO error while processing {ignorefile}", file=sys.stderr)
+
+    ts_helper = TS_Setup_Helper('./ts-languages.so')
+    code_df = pd.DataFrame()
+
+    files = get_files_to_parse(filepath, list(ts_helper.ext_map.keys()), dirs_to_ignore)
+
+    if len(files) == 0:
+        print("didn't find any files to parse", file=sys.stderr)
+        sys.exit(1)
+    for file in files:
+        #print(file)
+        code_df = pd.concat([code_df, ts_helper.parse_file(file)])
+
+    code_df["blob"] = code_df.apply(lambda x: blobify(x), axis=1)
+
+    code_df.to_csv('rust_with_blob.csv')
+
+    estimate_cost(code_df, skip_summary)
+
+    if not skip_summary:
+        print('generating summary')
+        code_df["summary"] = code_df.blob.apply(lambda x: generate_summary(x))
+        print('done with summaries')
+    else:
+        code_df["summary"] = "no summary. --skip-summary"
+
+    print('generating embeddings')
+    embedding_model = "text-embedding-ada-002"
+    #code_df["embedding_summary"] = code_df.summary.apply(
+    #    [lambda x: get_embedding(x, engine=embedding_model)]
+    #    )
+    print('done with embeddings')
+    code_df.to_csv(output_csv_filepath)
+
+    sys.exit(0)
+
+if __name__ == "__main__":
+    typer.run(setup)
+    #setup('YOUR_PATH_HERE', ['ignore', 'dirs', 'here'])
+    #setup("../../openpilot/", "./ope.csv",
+    #    ['tests', 'vendor', 'unix', 'test', 'debug', 'ui', 'third_party', 'tools', 'system']
+    #    )
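
The path-handling fix hinges on the frontend passing URL-safe base64 tokens in the `path` query parameter instead of raw paths, which `home()` then decodes. A minimal round-trip sketch of that encoding, using the same calls as the diff (the path value itself is illustrative, not from the commit):

```python
import base64

path = "src/main.rs"  # illustrative relative path, not from the commit
token = base64.urlsafe_b64encode(path.encode("utf-8")).decode("utf-8")
# the browser then requests /?path=<token>, and home() reverses it:
assert base64.urlsafe_b64decode(token).decode("utf-8") == path
```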
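The new `estimate_cost()` prices embeddings at $0.0001 per 1K tokens (Ada v2) and summaries at $0.0015 per 1K prompt tokens plus an assumed 500 completion tokens per blob at $0.002 per 1K. A worked example of that arithmetic with made-up counts:

```python
blobs = 200        # hypothetical function count
tokens = 60_000    # hypothetical total tokens across all blobs

embed_cost = (tokens / 1000) * 0.0001                          # $0.006
summary_cost = (tokens / 1000) * 0.0015 + blobs * 0.5 * 0.002  # $0.09 + $0.20
print(f"~${embed_cost + summary_cost:.3f} total")              # ~$0.296
```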
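For orientation, a sketch of how the reworked pipeline is meant to be invoked after this commit, going by the typer signature of `setup()` and the argv check in `server.py`; the repo path, ignore-file name, and CSV name below are placeholders, not values from the commit:

```sh
# index a repository; --ignorefile and --skip-summary are the options added here
python setup.py ./some-repo out.csv --ignorefile ignore.txt --skip-summary

# serve the resulting CSV; server.py now takes the CSV path as its only argument
python server.py out.csv
```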