diff --git a/scripts/generate-index-for-system-varibles.py b/scripts/generate-index-for-system-varibles.py new file mode 100644 index 0000000000000..e48a47e3226e0 --- /dev/null +++ b/scripts/generate-index-for-system-varibles.py @@ -0,0 +1,218 @@ +# This script automatically updates the "## Variable reference" section in `system-variable-reference.md` according to the latest reference information of all system variables in the TiDB documentation. +# Before running this script, specify the local directory of the TiDB source documentation in the `docs_dir` variable. +# If a system variable name contains `_`, the script adds all references to this variable name to the reference index, except when the variable name is surrounded by `_`, `-`, or English letters in the referenced docs. +# If a system variable name does not contain `_`, the script counts only link references to this variable as valid references and adds them to the reference index.
+ +import re +from pathlib import Path +import os + +docs_dir = "/Users/grcai/Documents/GitHub/docs-cn" # Specify the local directory of the TiDB source documentation, which can be either English or Chinese +reference_section_titles = ["Variable reference", "变量索引"] +referenced_in_text = ["Referenced in:", "引用该变量的文档:"] + +def get_md_files_in_toc(toc_file): + with open(toc_file, 'r', encoding='utf-8') as f: + content = f.read() + return re.findall(r'\[.*?\]\(/(?:[^)]+/)?([^/]+\.md)(?:#[^)]*)?\)', content) + +def generate_var_link(variable_line): + # Remove tags but keep the content inside span + variable_line = re.sub(r'\s*?(.*?)\s*?', r' \1', variable_line).strip() + # Remove backticks + variable_line = re.sub(r'`', '', variable_line).strip() + variable_line = variable_line.lstrip("#").strip().lower() # Remove leading "#" and trim spaces, convert to lowercase + variable_line = re.sub(r"[^\w\u4e00-\u9fff\s-]", "", variable_line) # Remove special symbols but keep spaces + variable_line = re.sub(r"\s+", "-", variable_line) # Replace spaces with hyphens + return variable_line + +def extract_variables(content): + variables = [] + for line in content.split('\n'): + if line.startswith('### '): + # Remove tags + clean_line = re.sub(r'', '', line).strip() + + # Extract variable name, match with or without backticks + var_name = re.sub(r'^### `(.*?)`|^### (.+)', r'\1\2', clean_line).strip() + + if var_name: + var_link = generate_var_link(line) # Variable name for generating links + variables.append((var_name, var_link)) + return variables + +def find_references(variable_name, var_link, docs_dir, reference_paths_to_be_checked): + references = [] + link_to_find = f"(/system-variables.md#{var_link})" + + for path in reference_paths_to_be_checked: + try: + with open(path, 'r', encoding='utf-8') as f: + content = f.read() + if link_to_find in content: + title = extract_doc_title(content, path) + rel_path = str(path.relative_to(docs_dir)) + references.append((title, rel_path)) + 
else: + if "_" in variable_name and variable_name in content: + if re.search(rf'(? tags + updated_content = re.sub(r'\[([^]]*?\s*?\s*?[^]]*)\]\(([^)]*)\)', lambda m: f'[{re.sub(r"\s*?\s*?", "", m.group(1))}]({m.group(2)})', updated_content) + + updated_content = re.sub(r"What's New in TiDB 5.0", "TiDB 5.0 Release Notes", updated_content) + + # Replace the last two empty lines with one empty line + updated_content = re.sub(r"\n\n$", "\n", updated_content) + + with open(ref_file_path, 'w', encoding='utf-8') as f: + f.write(updated_content) + +def main(): + global doc_md_list + + if doc_language == "en": + doc_md_list = list(set(get_md_files_in_toc(tidb_file) + get_md_files_in_toc(tidb_cloud_file))) + else: + doc_md_list = list(set(get_md_files_in_toc(tidb_file))) + + print("Start to generate reference information for system variables. It will take a few seconds...") + + try: + # Read system variables + with open(variables_file_path, 'r', encoding='utf-8') as f: + variables = extract_variables(f.read()) + # Generate new reference content + new_content = generate_reference_content(variables, docs_dir) + # Update reference file + update_reference_file_path(reference_file_path, new_content) + print("Reference file updated successfully!") + except Exception as e: + print(f"An error occurred: {e}") + +if __name__ == "__main__": + + variables_file = "system-variables.md" + reference_file = "system-variable-reference.md" + tidb_toc_file = "TOC.md" + tidb_cloud_toc_file = "TOC-tidb-cloud.md" + cloud_docs_dir = "https://docs.pingcap.com/tidbcloud" + variables_file_path = os.path.abspath(os.path.join(docs_dir, variables_file)) + reference_file_path = os.path.abspath(os.path.join(docs_dir, reference_file)) + tidb_file = os.path.abspath(os.path.join(docs_dir, tidb_toc_file)) + tidb_cloud_file = os.path.abspath(os.path.join(docs_dir, tidb_cloud_toc_file)) + + with open(variables_file_path, 'r', encoding='utf-8') as f: + variables_file_title = extract_doc_title(f.read(), 
variables_file_path) + + if not re.search(r'[\u4e00-\u9fff]', variables_file_title): # Check if title contains no Chinese characters + doc_language = "en" + referenced_in = referenced_in_text[0] + reference_section_title = reference_section_titles[0] + else: + doc_language = "zh" + referenced_in = referenced_in_text[1] + reference_section_title = reference_section_titles[1] + + main()