summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNate Buttke <nate-web@riseup.net>2023-07-27 17:48:51 -0700
committerNate Buttke <nate-web@riseup.net>2023-07-27 17:48:51 -0700
commit9d34c71ac0aa68d373a57505c4a9d763476cac7a (patch)
tree357ed7746cccd2c99903d1b1636c8058afc5d622
parentdb824e067d17eba3469a49dffb04566aed3449b2 (diff)
Move tree-sitter fns into new helper class. Need to make language choice more
modular.
-rw-r--r--setup.py140
1 file changed, 74 insertions(+), 66 deletions(-)
diff --git a/setup.py b/setup.py
index 3003c83..0b1e1cb 100644
--- a/setup.py
+++ b/setup.py
@@ -11,71 +11,76 @@ SOURCE_DIR = './'
openai.api_key = os.getenv('END_OF_WORLD')
-def ts_query(lang, tree, sexp):
- query = lang.query(sexp)
- return query.captures(tree.root_node)
-
-def ts_get_all_code_blocks(lang, code_blocks, file_path, tree, code):
- """Use treesitter to get all code blocks"""
-
- # TODO need way to switch between declaration and definition ..
- # e.g. golang does not have function definitions according to treesitter
- results = ts_query(lang, tree, """(function_declaration) @function""")
- results += ts_query(lang, tree, """(method_declaration) @method""")
-
- # TODO something like list comprehension here?
- for r in results:
- return_dict = {
- 'code_type': r[1],
- 'source': code[r[0].start_byte:r[0].end_byte].decode('utf-8'),
- 'start_line': r[0].start_point[0],
- 'end_line': r[0].end_point[0],
- 'chars': r[0].end_byte - r[0].start_byte,
- 'file_path': file_path
- }
- code_blocks.append(return_dict)
-
-def parse_file(file_path):
- """take source code file and return pd dataframe"""
- # read file
- with open(file_path, 'r') as f:
- code = f.read()
-
- # Tree-Sitter
- parser = Parser()
- lang = Language("./tree-go.so", "go")
- parser.set_language(lang)
- tree = parser.parse(bytes(code, "utf8"))
-
- code_blocks = []
- ts_get_all_code_blocks(lang, code_blocks, file_path, tree, bytes(code, "utf8"))
-
-
- #TODO
- # collate imports, assign
- collate_types = ['import', 'assign']
- tempblock = None
- finblocks = []
-
- for block in code_blocks:
- if block['code_type'] in collate_types:
- if tempblock is None:
- tempblock = {k:v for k,v in block.items()}
- elif tempblock['code_type'] == block['code_type']:
- tempblock['source'] += f"\n{block['source']}"
- tempblock['start_line'] = min(tempblock['start_line'], block['start_line'])
- tempblock['end_line'] = max(tempblock['start_line'], block['end_line'])
- tempblock['chars'] += (block['chars'] + 1)
class TS_Setup_Helper:
    """Wrap a tree-sitter parser/language pair and extract code blocks
    from source files into a pandas DataFrame.

    NOTE(review): the language is hard-wired by the caller via the compiled
    grammar path (e.g. './tree-go.so') — see the TODO about making language
    choice modular.
    """

    parser: Parser
    lang: Language

    def __init__(self, ts_object_path, lang_name):
        """Load the compiled tree-sitter grammar at *ts_object_path* for
        language *lang_name* (e.g. './tree-go.so', 'go')."""
        self.parser = Parser()
        self.lang = Language(ts_object_path, lang_name)
        self.parser.set_language(self.lang)

    def ts_query(self, lang, tree, sexp):
        """Run s-expression query *sexp* against *tree* and return the
        capture list of (node, capture_name) pairs.

        *lang* is kept for backward compatibility; current callers pass
        ``self.lang``.
        """
        query = lang.query(sexp)
        return query.captures(tree.root_node)

    def ts_get_all_code_blocks(self, code_blocks, file_path, tree, code):
        """Use treesitter to get all code blocks.

        Appends one dict per captured function/method to *code_blocks*.
        *code* must be the utf-8 encoded bytes the tree was parsed from so
        tree-sitter's byte offsets index it correctly.
        """
        # TODO need way to switch between declaration and definition ..
        # e.g. golang does not have function definitions according to treesitter
        results = self.ts_query(self.lang, tree, """(function_declaration) @function""")
        results += self.ts_query(self.lang, tree, """(method_declaration) @method""")

        for node, capture_name in results:
            code_blocks.append({
                'code_type': capture_name,          # 'function' or 'method'
                'source': code[node.start_byte:node.end_byte].decode('utf-8'),
                'start_line': node.start_point[0],  # tree-sitter points are 0-based
                'end_line': node.end_point[0],
                'chars': node.end_byte - node.start_byte,
                'file_path': file_path,
            })

    def parse_file(self, file_path):
        """Take a source code file and return a pd DataFrame of its blocks.

        Consecutive blocks sharing a collatable code_type ('import',
        'assign') are merged into a single row.
        """
        # BUGFIX: explicit encoding — the bytes handed to tree-sitter are
        # utf-8, so the text read must be too.
        with open(file_path, 'r', encoding='utf-8') as f:
            code = f.read()

        # Tree-Sitter: encode once and reuse for parsing and byte slicing.
        src_bytes = bytes(code, "utf8")
        tree = self.parser.parse(src_bytes)

        code_blocks = []
        self.ts_get_all_code_blocks(code_blocks, file_path, tree, src_bytes)

        # Collate runs of import/assign blocks into single rows.
        collate_types = ['import', 'assign']
        tempblock = None
        finblocks = []

        for block in code_blocks:
            if block['code_type'] in collate_types:
                if tempblock is None:
                    tempblock = dict(block)
                elif tempblock['code_type'] == block['code_type']:
                    tempblock['source'] += f"\n{block['source']}"
                    tempblock['start_line'] = min(tempblock['start_line'], block['start_line'])
                    # BUGFIX: was max(tempblock['start_line'], ...), which
                    # compared against the wrong key and could under-report
                    # the run's true end line.
                    tempblock['end_line'] = max(tempblock['end_line'], block['end_line'])
                    tempblock['chars'] += (block['chars'] + 1)  # +1 for the joining newline
                else:
                    # Different collatable type: close the current run.
                    finblocks.append(tempblock)
                    tempblock = dict(block)
            else:
                if tempblock is not None:
                    finblocks.append(tempblock)
                    tempblock = None
                finblocks.append(block)

        # BUGFIX: flush a trailing collated run — previously dropped when the
        # file ended with import/assign blocks.
        if tempblock is not None:
            finblocks.append(tempblock)

        return pd.DataFrame(finblocks)
def get_files_to_parse(root_path, files_extensions_to_parse=['go'], dirs_to_ignore=['tests', 'vendor', 'unix']) -> list:
@@ -119,15 +124,18 @@ def blobify(pandaSeries):
### doing stuff!!

# Parse every matching file under the target tree into one DataFrame of
# code blocks. Grammar/language are hard-coded to Go for now.
ts_helper = TS_Setup_Helper('./tree-go.so', 'go')

code_df = pd.DataFrame()
# Renamed loop variable: 'file' shadowed the builtin.
for src_path in get_files_to_parse("../../dirserver/src/dirserver/"):
    code_df = pd.concat([code_df, ts_helper.parse_file(src_path)])

# One text "blob" per row, used as the summary-generation input.
code_df["blob"] = code_df.apply(blobify, axis=1)
print(type(code_df))
print(code_df)
code_df.to_csv('1test_with_blob.csv')

# NOTE(review): debug gate — everything below (OpenAI summary generation,
# which costs API calls) is currently disabled. Remove exit() to re-enable.
exit()

print('starting to generate summary')  # BUGFIX: typo 'startng'
code_df["summary"] = code_df.blob.apply(generate_summary)