init

2025-11-17 13:39:05 +08:00
commit d4cfe2b9de
479 changed files with 109324 additions and 0 deletions
--- a/data/import_primary_words.py
+++ b/data/import_primary_words.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""导入小学英语核心词汇到数据库"""
+
+import os
+import re
+import uuid
+from datetime import datetime
+
+import mysql.connector
+import pandas as pd
+
+# Database connection settings passed to mysql.connector.connect().
+# Each value can be overridden through the environment variable named below;
+# the defaults reproduce the settings this script originally hard-coded.
+# NOTE(review): the fallback password is still committed to source control --
+# rotate it and drop the default once every deployment sets DB_PASSWORD.
+db_config = {
+    'host': os.environ.get('DB_HOST', 'localhost'),
+    'port': int(os.environ.get('DB_PORT', '3306')),
+    'user': os.environ.get('DB_USER', 'root'),
+    'password': os.environ.get('DB_PASSWORD', 'JKjk20011115'),
+    'database': os.environ.get('DB_NAME', 'ai_english_learning'),
+    'charset': 'utf8mb4',
+}
+
+# ID of the vocabulary book that every imported word is linked to.
+BOOK_ID = 'primary_core_1000'
+
+def generate_uuid():
+    """Return a freshly generated random (version 4) UUID as a string."""
+    fresh_id = uuid.uuid4()
+    return str(fresh_id)
+
+# Maps a part-of-speech abbreviation found in the Chinese-meaning column to
+# the value stored in ai_vocabulary_definitions.part_of_speech.
+_POS_ABBREVIATIONS = {
+    'n': 'noun',
+    'v': 'verb',
+    'vt': 'verb',
+    'vi': 'verb',
+    'adj': 'adjective',
+    'adv': 'adverb',
+    'prep': 'preposition',
+    'conj': 'conjunction',
+}
+
+# The lookbehind rejects abbreviations that are only the tail of a longer
+# one, e.g. the "v." inside "adv." or the "n." inside "pron.".
+_POS_ABBREV_RE = re.compile(r'(?<![A-Za-z])(adj|adv|prep|conj|vt|vi|v|n)\.')
+
+# Fallback markers, checked in this order when no abbreviation is present.
+_POS_CHINESE_MARKERS = (
+    ('动', 'verb'),
+    ('形', 'adjective'),
+    ('副', 'adverb'),
+    ('介', 'preposition'),
+    ('连', 'conjunction'),
+)
+
+
+def _clean_cell(value):
+    """Return a cell as stripped text, or '' when the cell is empty/NaN."""
+    if value is None or pd.isna(value):
+        return ''
+    return str(value).strip()
+
+
+def _guess_part_of_speech(translation):
+    """Guess the part of speech from a Chinese-meaning string.
+
+    The first explicit abbreviation ("n.", "v.", "adj.", ...) wins. Without
+    one, the single-character Chinese markers are tried, and 'noun' is the
+    default.
+    """
+    match = _POS_ABBREV_RE.search(translation)
+    if match:
+        return _POS_ABBREVIATIONS[match.group(1)]
+    for marker, part_of_speech in _POS_CHINESE_MARKERS:
+        if marker in translation:
+            return part_of_speech
+    return 'noun'
+
+
+def import_words_from_excel(file_path):
+    """Import the words of an Excel sheet into the vocabulary tables.
+
+    For every row with a non-empty 'Word' cell this upserts the word into
+    ai_vocabulary, inserts one definition (when a Chinese meaning exists) and
+    one example sentence (when both the English and Chinese sentence exist),
+    and links the word to the book identified by BOOK_ID. Each row is
+    committed on its own, so a row that fails is rolled back as a whole
+    and does not leave partial records behind. Finally the book's total_words
+    is refreshed from the number of words actually linked to it.
+
+    Errors are reported on stdout and are not re-raised.
+
+    Args:
+        file_path: Path of the Excel file. The columns read are 'Word',
+            '美式音标', '英式音标', '中文含义', '例句' and '例句中文翻译'.
+    """
+    # Bound before the try block so that `finally` can test them even when
+    # reading the file or connecting fails.
+    conn = None
+    cursor = None
+    try:
+        # Read the Excel file and show a short preview.
+        print(f"📖 正在读取文件: {file_path}")
+        df = pd.read_excel(file_path)
+
+        print(f"📊 文件列名: {df.columns.tolist()}")
+        print(f"📊 总行数: {len(df)}")
+        print("\n前5行数据预览:")
+        print(df.head())
+
+        # Connect to the database.
+        conn = mysql.connector.connect(**db_config)
+        cursor = conn.cursor()
+
+        # `id = LAST_INSERT_ID(id)` makes cursor.lastrowid return the id of
+        # the existing row when the word is already present.
+        insert_vocab_sql = """
+        INSERT INTO ai_vocabulary 
+        (word, phonetic, level, frequency, is_active, created_at, updated_at)
+        VALUES (%s, %s, %s, %s, %s, %s, %s)
+        ON DUPLICATE KEY UPDATE
+        id = LAST_INSERT_ID(id),
+        phonetic = VALUES(phonetic),
+        level = VALUES(level),
+        frequency = VALUES(frequency),
+        updated_at = VALUES(updated_at)
+        """
+
+        # NOTE(review): definitions and examples are plain INSERTs, so running
+        # the import twice presumably duplicates them -- confirm against the
+        # table constraints.
+        insert_definition_sql = """
+        INSERT INTO ai_vocabulary_definitions
+        (vocabulary_id, part_of_speech, definition_en, definition_cn, sort_order, created_at)
+        VALUES (%s, %s, %s, %s, %s, %s)
+        """
+
+        insert_example_sql = """
+        INSERT INTO ai_vocabulary_examples
+        (vocabulary_id, sentence_en, sentence_cn, sort_order, created_at)
+        VALUES (%s, %s, %s, %s, %s)
+        """
+
+        insert_book_word_sql = """
+        INSERT INTO ai_vocabulary_book_words
+        (book_id, vocabulary_id, sort_order, created_at)
+        VALUES (%s, %s, %s, %s)
+        ON DUPLICATE KEY UPDATE sort_order = VALUES(sort_order)
+        """
+
+        success_count = 0
+        error_count = 0
+
+        # `position` is the zero-based row number within the sheet.
+        for position, (_, row) in enumerate(df.iterrows()):
+            try:
+                word = _clean_cell(row.get('Word', ''))
+                if not word:
+                    continue
+
+                # Prefer the American phonetic, then the British one.
+                phonetic = (
+                    _clean_cell(row.get('美式音标', ''))
+                    or _clean_cell(row.get('英式音标', ''))
+                    or None
+                )
+
+                translation = _clean_cell(row.get('中文含义', ''))
+                part_of_speech = _guess_part_of_speech(translation)
+
+                example_en = _clean_cell(row.get('例句', ''))
+                example_cn = _clean_cell(row.get('例句中文翻译', ''))
+
+                # Upsert the word itself.
+                now = datetime.now()
+                cursor.execute(insert_vocab_sql, (
+                    word,
+                    phonetic,
+                    'beginner',    # primary-school vocabulary level
+                    position + 1,  # the row number doubles as the frequency
+                    True,
+                    now,
+                    now,
+                ))
+                vocab_id = cursor.lastrowid
+
+                # Insert the definition.
+                if translation:
+                    cursor.execute(insert_definition_sql, (
+                        vocab_id,
+                        part_of_speech,
+                        word,  # the word itself stands in for the English definition
+                        translation,
+                        0,
+                        now,
+                    ))
+
+                # Insert the example; several examples are separated by a
+                # full-width semicolon and only the first one is kept.
+                if example_en and example_cn:
+                    cursor.execute(insert_example_sql, (
+                        vocab_id,
+                        example_en.split('；', 1)[0].strip(),
+                        example_cn.split('；', 1)[0].strip(),
+                        0,
+                        now,
+                    ))
+
+                # Link the word to the vocabulary book.
+                cursor.execute(insert_book_word_sql, (
+                    BOOK_ID,
+                    vocab_id,
+                    position,
+                    now,
+                ))
+
+                # One transaction per row keeps every row all-or-nothing.
+                conn.commit()
+
+                success_count += 1
+                if success_count % 50 == 0:
+                    print(f"✅ 已导入 {success_count} 个单词...")
+
+            except Exception as e:
+                # Discard whatever part of this row was already executed.
+                conn.rollback()
+                error_count += 1
+                print(f"❌ 导入第 {position + 1} 行失败: {e}")
+                print(f"   数据: {row.to_dict()}")
+
+        # Count the words actually linked to the book; this can differ from
+        # success_count when several rows resolve to the same vocabulary id.
+        cursor.execute(
+            "SELECT COUNT(*) FROM ai_vocabulary_book_words WHERE book_id = %s",
+            (BOOK_ID,)
+        )
+        count = cursor.fetchone()[0]
+
+        # Update the book's total word count.
+        cursor.execute(
+            "UPDATE ai_vocabulary_books SET total_words = %s WHERE id = %s",
+            (count, BOOK_ID)
+        )
+        conn.commit()
+
+        print(f"\n🎉 导入完成!")
+        print(f"✅ 成功: {success_count} 个单词")
+        print(f"❌ 失败: {error_count} 个单词")
+        print(f"📊 词汇书中共有 {count} 个单词")
+
+    except Exception as e:
+        print(f"❌ 导入失败: {e}")
+        import traceback
+        traceback.print_exc()
+    finally:
+        if cursor is not None:
+            cursor.close()
+        if conn is not None:
+            conn.close()
+
+if __name__ == '__main__':
+    import sys
+
+    # An optional first command-line argument overrides the default workbook.
+    import_words_from_excel(sys.argv[1] if len(sys.argv) > 1 else 'data/小学.xlsx')