summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNate Buttke <nate-web@riseup.net>2023-07-27 17:48:51 -0700
committerNate Buttke <nate-web@riseup.net>2023-07-27 17:48:51 -0700
commit9d34c71ac0aa68d373a57505c4a9d763476cac7a (patch)
tree357ed7746cccd2c99903d1b1636c8058afc5d622
parentdb824e067d17eba3469a49dffb04566aed3449b2 (diff)
Move tree-sitter fns into new helper class. Need to make language choice more
modular.
-rw-r--r--setup.py140
1 file changed, 74 insertions(+), 66 deletions(-)
diff --git a/setup.py b/setup.py
index 3003c83..0b1e1cb 100644
--- a/setup.py
+++ b/setup.py
@@ -11,71 +11,76 @@ SOURCE_DIR = './'
openai.api_key = os.getenv('END_OF_WORLD')
-def ts_query(lang, tree, sexp):
- query = lang.query(sexp)
- return query.captures(tree.root_node)
-
-def ts_get_all_code_blocks(lang, code_blocks, file_path, tree, code):
- """Use treesitter to get all code blocks"""
-
- # TODO need way to switch between declaration and definition ..
- # e.g. golang does not have function definitions according to treesitter
- results = ts_query(lang, tree, """(function_declaration) @function""")
- results += ts_query(lang, tree, """(method_declaration) @method""")
-
- # TODO something like list comprehension here?
- for r in results:
- return_dict = {
- 'code_type': r[1],
- 'source': code[r[0].start_byte:r[0].end_byte].decode('utf-8'),
- 'start_line': r[0].start_point[0],
- 'end_line': r[0].end_point[0],
- 'chars': r[0].end_byte - r[0].start_byte,
- 'file_path': file_path
- }
- code_blocks.append(return_dict)
-
-def parse_file(file_path):
- """take source code file and return pd dataframe"""
- # read file
- with open(file_path, 'r') as f:
- code = f.read()
-
- # Tree-Sitter
- parser = Parser()
- lang = Language("./tree-go.so", "go")
- parser.set_language(lang)
- tree = parser.parse(bytes(code, "utf8"))
-
- code_blocks = []
- ts_get_all_code_blocks(lang, code_blocks, file_path, tree, bytes(code, "utf8"))
-
-
- #TODO
- # collate imports, assign
- collate_types = ['import', 'assign']
- tempblock = None
- finblocks = []
-
- for block in code_blocks:
- if block['code_type'] in collate_types:
- if tempblock is None:
- tempblock = {k:v for k,v in block.items()}
- elif tempblock['code_type'] == block['code_type']:
- tempblock['source'] += f"\n{block['source']}"
- tempblock['start_line'] = min(tempblock['start_line'], block['start_line'])
- tempblock['end_line'] = max(tempblock['start_line'], block['end_line'])
- tempblock['chars'] += (block['chars'] + 1)
class TS_Setup_Helper:
    """Wrap a tree-sitter parser/language pair and extract code blocks
    from source files into a pandas DataFrame.

    NOTE(review): the language is hard-wired by the caller via the compiled
    grammar path (e.g. './tree-go.so') — see the TODO about making language
    choice modular.
    """

    parser: Parser
    lang: Language

    def __init__(self, ts_object_path, lang_name):
        """Load the compiled tree-sitter grammar at *ts_object_path* for
        language *lang_name* (e.g. './tree-go.so', 'go')."""
        self.parser = Parser()
        self.lang = Language(ts_object_path, lang_name)
        self.parser.set_language(self.lang)

    def ts_query(self, lang, tree, sexp):
        """Run s-expression query *sexp* against *tree* and return the
        capture list of (node, capture_name) pairs.

        *lang* is kept for backward compatibility; current callers pass
        ``self.lang``.
        """
        query = lang.query(sexp)
        return query.captures(tree.root_node)

    def ts_get_all_code_blocks(self, code_blocks, file_path, tree, code):
        """Use treesitter to get all code blocks.

        Appends one dict per captured function/method to *code_blocks*.
        *code* must be the utf-8 encoded bytes the tree was parsed from so
        tree-sitter's byte offsets index it correctly.
        """
        # TODO need way to switch between declaration and definition ..
        # e.g. golang does not have function definitions according to treesitter
        results = self.ts_query(self.lang, tree, """(function_declaration) @function""")
        results += self.ts_query(self.lang, tree, """(method_declaration) @method""")

        for node, capture_name in results:
            code_blocks.append({
                'code_type': capture_name,          # 'function' or 'method'
                'source': code[node.start_byte:node.end_byte].decode('utf-8'),
                'start_line': node.start_point[0],  # tree-sitter points are 0-based
                'end_line': node.end_point[0],
                'chars': node.end_byte - node.start_byte,
                'file_path': file_path,
            })

    def parse_file(self, file_path):
        """Take a source code file and return a pd DataFrame of its blocks.

        Consecutive blocks sharing a collatable code_type ('import',
        'assign') are merged into a single row.
        """
        # BUGFIX: explicit encoding — the bytes handed to tree-sitter are
        # utf-8, so the text read must be too.
        with open(file_path, 'r', encoding='utf-8') as f:
            code = f.read()

        # Tree-Sitter: encode once and reuse for parsing and byte slicing.
        src_bytes = bytes(code, "utf8")
        tree = self.parser.parse(src_bytes)

        code_blocks = []
        self.ts_get_all_code_blocks(code_blocks, file_path, tree, src_bytes)

        # Collate runs of import/assign blocks into single rows.
        collate_types = ['import', 'assign']
        tempblock = None
        finblocks = []

        for block in code_blocks:
            if block['code_type'] in collate_types:
                if tempblock is None:
                    tempblock = dict(block)
                elif tempblock['code_type'] == block['code_type']:
                    tempblock['source'] += f"\n{block['source']}"
                    tempblock['start_line'] = min(tempblock['start_line'], block['start_line'])
                    # BUGFIX: was max(tempblock['start_line'], ...), which
                    # compared against the wrong key and could under-report
                    # the run's true end line.
                    tempblock['end_line'] = max(tempblock['end_line'], block['end_line'])
                    tempblock['chars'] += (block['chars'] + 1)  # +1 for the joining newline
                else:
                    # Different collatable type: close the current run.
                    finblocks.append(tempblock)
                    tempblock = dict(block)
            else:
                if tempblock is not None:
                    finblocks.append(tempblock)
                    tempblock = None
                finblocks.append(block)

        # BUGFIX: flush a trailing collated run — previously dropped when the
        # file ended with import/assign blocks.
        if tempblock is not None:
            finblocks.append(tempblock)

        return pd.DataFrame(finblocks)
def get_files_to_parse(root_path, files_extensions_to_parse=['go'], dirs_to_ignore=['tests', 'vendor', 'unix']) -> list:
@@ -119,15 +124,18 @@ def blobify(pandaSeries):
### doing stuff!!

# Parse every matching file under the target tree into one DataFrame of
# code blocks. Grammar/language are hard-coded to Go for now.
ts_helper = TS_Setup_Helper('./tree-go.so', 'go')

code_df = pd.DataFrame()
# Renamed loop variable: 'file' shadowed the builtin.
for src_path in get_files_to_parse("../../dirserver/src/dirserver/"):
    code_df = pd.concat([code_df, ts_helper.parse_file(src_path)])

# One text "blob" per row, used as the summary-generation input.
code_df["blob"] = code_df.apply(blobify, axis=1)
print(type(code_df))
print(code_df)
code_df.to_csv('1test_with_blob.csv')

# NOTE(review): debug gate — everything below (OpenAI summary generation,
# which costs API calls) is currently disabled. Remove exit() to re-enable.
exit()

print('starting to generate summary')  # BUGFIX: typo 'startng'
code_df["summary"] = code_df.blob.apply(generate_summary)