summary refs log tree commit diff
diff options
context:
space:
mode:
author Nate Buttke <nate-web@riseup.net> 2023-08-12 20:12:27 -0700
committer Nate Buttke <nate-web@riseup.net> 2023-08-12 20:12:27 -0700
commit c6f18dea08cdf48e1bff3f357ff9c51547f57157 (patch)
tree 25af0f2062df2f3c05ac3aae248e467c09e24172
parent 33db0cf91c9ce1a79b9b7898374fa7b63336196b (diff)
modify setup to allow lots of languages!
-rw-r--r-- .gitmodules 3
-rw-r--r-- setup.py 77
m--------- tree-sitter-rust 0
-rw-r--r-- ts_create_object.py 11
4 files changed, 70 insertions, 21 deletions
diff --git a/.gitmodules b/.gitmodules
index 4246223..be06b4e 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -19,3 +19,6 @@
[submodule "tree-sitter-haskell"]
path = tree-sitter-haskell
url = https://github.com/tree-sitter/tree-sitter-haskell
+[submodule "tree-sitter-rust"]
+ path = tree-sitter-rust
+ url = https://github.com/tree-sitter/tree-sitter-rust
diff --git a/setup.py b/setup.py
index 0b1e1cb..8ff5027 100644
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,6 @@
from collections import defaultdict
import os
+import sys
import pandas as pd
import openai
import tiktoken
@@ -13,24 +14,59 @@ openai.api_key = os.getenv('END_OF_WORLD')
class TS_Setup_Helper:
parser: Parser
- lang: Language
-
- def __init__(self, ts_object_path, lang_name):
+ ts_obj_path: str
+ ext_map: dict
+
+ def __init__(self, ts_object_path):
+ self.ts_object_path = ts_object_path
+ self.BASH_LANGUAGE = Language(ts_object_path, 'bash')
+ self.C_LANGUAGE = Language(ts_object_path, 'c')
+ self.CPP_LANGUAGE = Language(ts_object_path, 'cpp')
+ self.GO_LANGUAGE = Language(ts_object_path, 'go')
+ self.HS_LANGUAGE = Language(ts_object_path, 'haskell')
+ self.JS_LANGUAGE = Language(ts_object_path, 'javascript')
+ self.PY_LANGUAGE = Language(ts_object_path, 'python')
+ self.RS_LANGUAGE = Language(ts_object_path, 'rust')
self.parser = Parser()
- self.lang = Language(ts_object_path, lang_name)
- self.parser.set_language(self.lang)
+
+ self.ext_map = {
+ 'sh': self.BASH_LANGUAGE,
+ 'c': self.C_LANGUAGE,
+ 'h': self.C_LANGUAGE,
+ 'cpp': self.CPP_LANGUAGE,
+ 'cxx': self.CPP_LANGUAGE,
+ 'hxx': self.CPP_LANGUAGE,
+ 'hpp': self.CPP_LANGUAGE,
+ 'go': self.GO_LANGUAGE,
+ 'hs': self.HS_LANGUAGE,
+ 'js': self.JS_LANGUAGE,
+ 'py': self.PY_LANGUAGE,
+ 'rs': self.RS_LANGUAGE
+ }
+
+ self.qmap = {
+ self.BASH_LANGUAGE: ["""(function_definition) @function""", """(variable_assignment) @assign"""],
+ self.C_LANGUAGE: ["""(function_definition) @function""", """(preproc_include) @import"""],
+ self.CPP_LANGUAGE: ["""(function_definition) @function""", """(preproc_include) @import"""],
+ self.GO_LANGUAGE: ["""(function_declaration) @function""", """(method_declaration) @method"""],
+ self.JS_LANGUAGE: ["""[(function) (function_declaration)] @function"""],
+ self.PY_LANGUAGE: ["""(function_definition) @function""", """[(import_statement) (import_from_statement)] @import"""],
+ self.RS_LANGUAGE: ["""(function_item) @function""", """(use_declaration) @import"""]
+ }
def ts_query(self, lang, tree, sexp):
query = lang.query(sexp)
return query.captures(tree.root_node)
- def ts_get_all_code_blocks(self, code_blocks, file_path, tree, code):
+ def ts_get_all_code_blocks(self, code_blocks, file_path, lang, tree, code):
"""Use treesitter to get all code blocks"""
# TODO need way to switch between declaration and definition ..
# e.g. golang does not have function definitions according to treesitter
- results = self.ts_query(self.lang, tree, """(function_declaration) @function""")
- results += self.ts_query(self.lang, tree, """(method_declaration) @method""")
+ results = [ ]
+ for query in self.qmap.get(lang):
+ print(query)
+ results += self.ts_query(lang, tree, query)
# TODO something like list comprehension here?
for r in results:
@@ -45,19 +81,23 @@ class TS_Setup_Helper:
code_blocks.append(return_dict)
def parse_file(self, file_path):
+ print('parse')
"""take source code file and return pd dataframe"""
# read file
with open(file_path, 'r') as f:
code = f.read()
# Tree-Sitter
+ extension = os.path.splitext(file_path)[1].lstrip(".")
+ lang = self.ext_map.get(extension)
+ if lang is None:
+ raise NotImplementedError(f"The file extension .{extension} is not implemented")
+ self.parser.set_language(lang)
tree = self.parser.parse(bytes(code, "utf8"))
code_blocks = []
- self.ts_get_all_code_blocks(code_blocks, file_path, tree, bytes(code, "utf8"))
+ self.ts_get_all_code_blocks(code_blocks, file_path, lang, tree, bytes(code, "utf8"))
- #TODO
- # collate imports, assign
collate_types = ['import', 'assign']
tempblock = None
finblocks = []
@@ -83,7 +123,7 @@ class TS_Setup_Helper:
return df
-def get_files_to_parse(root_path, files_extensions_to_parse=['go'], dirs_to_ignore=['tests', 'vendor', 'unix']) -> list:
+def get_files_to_parse(root_path, files_extensions_to_parse, dirs_to_ignore=['tests', 'vendor', 'unix']) -> list:
"""get all source file paths as list."""
files_to_parse = []
for root, dirs, files in os.walk(root_path):
@@ -124,18 +164,23 @@ def blobify(pandaSeries):
### doing stuff!!
-ts_helper = TS_Setup_Helper('./tree-go.so', 'go')
+ts_helper = TS_Setup_Helper('./ts-languages.so')
code_df = pd.DataFrame()
-for file in get_files_to_parse("../../dirserver/src/dirserver/"):
+#files = get_files_to_parse("../../dirserver/src/dirserver/", ts_helper.ext_map.keys(), dirs_to_ignore=['tests', 'vendor', 'unix']):
+
+files = get_files_to_parse("./rs", ts_helper.ext_map.keys())
+if len(files) == 0:
+ print("didn't find any files to parse", file=sys.stderr)
+ exit(1)
+for file in files:
code_df = pd.concat([code_df, ts_helper.parse_file(file)])
code_df["blob"] = code_df.apply(lambda x: blobify(x),axis=1)
print(type(code_df))
print(code_df)
-code_df.to_csv('1test_with_blob.csv')
-exit()
+code_df.to_csv('rust_with_blob.csv')
print('startng to generate summary')
code_df["summary"] = code_df.blob.apply(lambda x: generate_summary(x))
diff --git a/tree-sitter-rust b/tree-sitter-rust
new file mode 160000
+Subproject 0a70e15da977489d954c219af9b50b8a722630e
diff --git a/ts_create_object.py b/ts_create_object.py
index 2cf3734..c95e663 100644
--- a/ts_create_object.py
+++ b/ts_create_object.py
@@ -6,12 +6,13 @@ Language.build_library(
# Include one or more languages
[
- 'tree-sitter-javascript',
- 'tree-sitter-python',
- 'tree-sitter-go',
+ 'tree-sitter-bash',
'tree-sitter-c',
'tree-sitter-cpp',
- 'tree-sitter-bash',
- 'tree-sitter-haskell'
+ 'tree-sitter-go',
+ 'tree-sitter-haskell',
+ 'tree-sitter-javascript',
+ 'tree-sitter-python',
+ 'tree-sitter-rust'
]
)