diff options
-rw-r--r-- | .gitmodules | 3 | ||||
-rw-r--r-- | setup.py | 77 | ||||
m--------- | tree-sitter-rust | 0 | ||||
-rw-r--r-- | ts_create_object.py | 11 |
4 files changed, 70 insertions, 21 deletions
diff --git a/.gitmodules b/.gitmodules index 4246223..be06b4e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -19,3 +19,6 @@ [submodule "tree-sitter-haskell"] path = tree-sitter-haskell url = https://github.com/tree-sitter/tree-sitter-haskell +[submodule "tree-sitter-rust"] + path = tree-sitter-rust + url = https://github.com/tree-sitter/tree-sitter-rust @@ -1,5 +1,6 @@ from collections import defaultdict import os +import sys import pandas as pd import openai import tiktoken @@ -13,24 +14,59 @@ openai.api_key = os.getenv('END_OF_WORLD') class TS_Setup_Helper: parser: Parser - lang: Language - - def __init__(self, ts_object_path, lang_name): + ts_obj_path: str + ext_map: dict + + def __init__(self, ts_object_path): + self.ts_object_path = ts_object_path + self.BASH_LANGUAGE = Language(ts_object_path, 'bash') + self.C_LANGUAGE = Language(ts_object_path, 'c') + self.CPP_LANGUAGE = Language(ts_object_path, 'cpp') + self.GO_LANGUAGE = Language(ts_object_path, 'go') + self.HS_LANGUAGE = Language(ts_object_path, 'haskell') + self.JS_LANGUAGE = Language(ts_object_path, 'javascript') + self.PY_LANGUAGE = Language(ts_object_path, 'python') + self.RS_LANGUAGE = Language(ts_object_path, 'rust') self.parser = Parser() - self.lang = Language(ts_object_path, lang_name) - self.parser.set_language(self.lang) + + self.ext_map = { + 'sh': self.BASH_LANGUAGE, + 'c': self.C_LANGUAGE, + 'h': self.C_LANGUAGE, + 'cpp': self.CPP_LANGUAGE, + 'cxx': self.CPP_LANGUAGE, + 'hxx': self.CPP_LANGUAGE, + 'hpp': self.CPP_LANGUAGE, + 'go': self.GO_LANGUAGE, + 'hs': self.HS_LANGUAGE, + 'js': self.JS_LANGUAGE, + 'py': self.PY_LANGUAGE, + 'rs': self.RS_LANGUAGE + } + + self.qmap = { + self.BASH_LANGUAGE: ["""(function_definition) @function""", """(variable_assignment) @assign"""], + self.C_LANGUAGE: ["""(function_definition) @function""", """(preproc_include) @import"""], + self.CPP_LANGUAGE: ["""(function_definition) @function""", """(preproc_include) @import"""], + self.GO_LANGUAGE: ["""(function_declaration) @function""", """(method_declaration) @method"""], + self.JS_LANGUAGE: ["""[(function) (function_declaration)] @function"""], + self.PY_LANGUAGE: ["""(function_definition) @function""", """[(import_statement) (import_from_statement)] @import"""], + self.RS_LANGUAGE: ["""(function_item) @function""", """(use_declaration) @import"""] + } def ts_query(self, lang, tree, sexp): query = lang.query(sexp) return query.captures(tree.root_node) - def ts_get_all_code_blocks(self, code_blocks, file_path, tree, code): + def ts_get_all_code_blocks(self, code_blocks, file_path, lang, tree, code): """Use treesitter to get all code blocks""" # TODO need way to switch between declaration and definition .. # e.g. golang does not have function definitions according to treesitter - results = self.ts_query(self.lang, tree, """(function_declaration) @function""") - results += self.ts_query(self.lang, tree, """(method_declaration) @method""") + results = [ ] + for query in self.qmap.get(lang): + print(query) + results += self.ts_query(lang, tree, query) # TODO something like list comprehension here? for r in results: @@ -45,19 +81,23 @@ class TS_Setup_Helper: code_blocks.append(return_dict) def parse_file(self, file_path): + print('parse') """take source code file and return pd dataframe""" # read file with open(file_path, 'r') as f: code = f.read() # Tree-Sitter + extension = os.path.splitext(file_path)[1].lstrip(".") + lang = self.ext_map.get(extension) + if lang is None: + raise NotImplementedError(f"The file extension .{extension} is not implemented") + self.parser.set_language(lang) tree = self.parser.parse(bytes(code, "utf8")) code_blocks = [] - self.ts_get_all_code_blocks(code_blocks, file_path, tree, bytes(code, "utf8")) + self.ts_get_all_code_blocks(code_blocks, file_path, lang, tree, bytes(code, "utf8")) - #TODO - # collate imports, assign collate_types = ['import', 'assign'] tempblock = None finblocks = [] @@ -83,7 +123,7 @@ class TS_Setup_Helper: return df -def get_files_to_parse(root_path, files_extensions_to_parse=['go'], dirs_to_ignore=['tests', 'vendor', 'unix']) -> list: +def get_files_to_parse(root_path, files_extensions_to_parse, dirs_to_ignore=['tests', 'vendor', 'unix']) -> list: """get all source file paths as list.""" files_to_parse = [] for root, dirs, files in os.walk(root_path): @@ -124,18 +164,23 @@ def blobify(pandaSeries): ### doing stuff!! -ts_helper = TS_Setup_Helper('./tree-go.so', 'go') +ts_helper = TS_Setup_Helper('./ts-languages.so') code_df = pd.DataFrame() -for file in get_files_to_parse("../../dirserver/src/dirserver/"): +#files = get_files_to_parse("../../dirserver/src/dirserver/", ts_helper.ext_map.keys(), dirs_to_ignore=['tests', 'vendor', 'unix']): + +files = get_files_to_parse("./rs", ts_helper.ext_map.keys()) +if len(files) == 0: + print("didn't find any files to parse", file=sys.stderr) + exit(1) +for file in files: code_df = pd.concat([code_df, ts_helper.parse_file(file)]) code_df["blob"] = code_df.apply(lambda x: blobify(x),axis=1) print(type(code_df)) print(code_df) -code_df.to_csv('1test_with_blob.csv') -exit() +code_df.to_csv('rust_with_blob.csv') print('startng to generate summary') code_df["summary"] = code_df.blob.apply(lambda x: generate_summary(x)) diff --git a/tree-sitter-rust b/tree-sitter-rust new file mode 160000 +Subproject 0a70e15da977489d954c219af9b50b8a722630e diff --git a/ts_create_object.py b/ts_create_object.py index 2cf3734..c95e663 100644 --- a/ts_create_object.py +++ b/ts_create_object.py @@ -6,12 +6,13 @@ Language.build_library( # Include one or more languages [ - 'tree-sitter-javascript', - 'tree-sitter-python', - 'tree-sitter-go', + 'tree-sitter-bash', 'tree-sitter-c', 'tree-sitter-cpp', - 'tree-sitter-bash', - 'tree-sitter-haskell' + 'tree-sitter-go', + 'tree-sitter-haskell', + 'tree-sitter-javascript', + 'tree-sitter-python', + 'tree-sitter-rust' ] ) |