ai_english/data/import_primary_words.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""导入小学英语核心词汇到数据库"""

import pandas as pd
import mysql.connector
from datetime import datetime
import uuid

# 数据库配置
db_config = {
    'host': 'localhost',
    'port': 3306,
    'user': 'root',
    'password': 'JKjk20011115',
    'database': 'ai_english_learning',
    'charset': 'utf8mb4'
}

# 词汇书ID
BOOK_ID = 'primary_core_1000'

def generate_uuid():
    """生成UUID"""
    return str(uuid.uuid4())

def import_words_from_excel(file_path):
    """从Excel导入单词"""
    try:
        # 读取Excel文件
        print(f"📖 正在读取文件: {file_path}")
        df = pd.read_excel(file_path)

        print(f"📊 文件列名: {df.columns.tolist()}")
        print(f"📊 总行数: {len(df)}")
        print(f"\n前5行数据预览:")
        print(df.head())

        # 连接数据库
        conn = mysql.connector.connect(**db_config)
        cursor = conn.cursor()

        # 准备SQL语句
        insert_vocab_sql = """
        INSERT INTO ai_vocabulary
        (word, phonetic, level, frequency, is_active, created_at, updated_at)
        VALUES (%s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE
        id = LAST_INSERT_ID(id),
        phonetic = VALUES(phonetic),
        level = VALUES(level),
        frequency = VALUES(frequency),
        updated_at = VALUES(updated_at)
        """

        insert_definition_sql = """
        INSERT INTO ai_vocabulary_definitions
        (vocabulary_id, part_of_speech, definition_en, definition_cn, sort_order, created_at)
        VALUES (%s, %s, %s, %s, %s, %s)
        """

        insert_example_sql = """
        INSERT INTO ai_vocabulary_examples
        (vocabulary_id, sentence_en, sentence_cn, sort_order, created_at)
        VALUES (%s, %s, %s, %s, %s)
        """

        insert_book_word_sql = """
        INSERT INTO ai_vocabulary_book_words
        (book_id, vocabulary_id, sort_order, created_at)
        VALUES (%s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE sort_order = VALUES(sort_order)
        """

        success_count = 0
        error_count = 0

        # 遍历每一行
        for index, row in df.iterrows():
            try:
                # 提取数据（根据实际Excel列名调整）
                word = str(row.get('Word', '')).strip()
                if not word or word == 'nan':
                    continue

                # 优先使用美式音标
                phonetic = str(row.get('美式音标', '')).strip()
                if phonetic == 'nan' or not phonetic:
                    phonetic = str(row.get('英式音标', '')).strip()
                if phonetic == 'nan':
                    phonetic = None

                translation = str(row.get('中文含义', '')).strip()
                if translation == 'nan':
                    translation = ''

                # 从中文含义中提取词性（如果有的话）
                part_of_speech = 'noun'  # 默认为名词
                if translation:
                    if 'v.' in translation or '动' in translation:
                        part_of_speech = 'verb'
                    elif 'adj.' in translation or '形' in translation:
                        part_of_speech = 'adjective'
                    elif 'adv.' in translation or '副' in translation:
                        part_of_speech = 'adverb'
                    elif 'prep.' in translation or '介' in translation:
                        part_of_speech = 'preposition'
                    elif 'conj.' in translation or '连' in translation:
                        part_of_speech = 'conjunction'

                example_en = str(row.get('例句', '')).strip()
                if example_en == 'nan' or not example_en:
                    example_en = None

                example_cn = str(row.get('例句中文翻译', '')).strip()
                if example_cn == 'nan' or not example_cn:
                    example_cn = None

                # 插入词汇
                now = datetime.now()
                cursor.execute(insert_vocab_sql, (
                    word,
                    phonetic,
                    'beginner',  # 小学词汇难度为beginner
                    index + 1,   # 使用行号作为频率
                    True,
                    now,
                    now
                ))

                # 获取插入的ID
                vocab_id = cursor.lastrowid

                # 插入释义
                if translation:
                    cursor.execute(insert_definition_sql, (
                        vocab_id,
                        part_of_speech,
                        word,  # 英文定义暂时用单词本身
                        translation,
                        0,
                        now
                    ))

                # 插入例句（只取第一个例句）
                if example_en and example_cn:
                    # 如果有多个例句，用分号分隔，只取第一个
                    first_example_en = example_en.split('；')[0] if '；' in example_en else example_en
                    first_example_cn = example_cn.split('；')[0] if '；' in example_cn else example_cn

                    cursor.execute(insert_example_sql, (
                        vocab_id,
                        first_example_en,
                        first_example_cn,
                        0,
                        now
                    ))

                # 关联到词汇书
                cursor.execute(insert_book_word_sql, (
                    BOOK_ID,
                    vocab_id,
                    index,
                    now
                ))

                success_count += 1
                if success_count % 50 == 0:
                    print(f"✅ 已导入 {success_count} 个单词...")
                    conn.commit()

            except Exception as e:
                error_count += 1
                print(f"❌ 导入第 {index + 1} 行失败: {e}")
                print(f"   数据: {row.to_dict()}")

        # 提交事务
        conn.commit()

        # 更新词汇书的总单词数
        cursor.execute(
            "UPDATE ai_vocabulary_books SET total_words = %s WHERE id = %s",
            (success_count, BOOK_ID)
        )
        conn.commit()

        print(f"\n🎉 导入完成!")
        print(f"✅ 成功: {success_count} 个单词")
        print(f"❌ 失败: {error_count} 个单词")

        # 验证数据
        cursor.execute(
            "SELECT COUNT(*) FROM ai_vocabulary_book_words WHERE book_id = %s",
            (BOOK_ID,)
        )
        count = cursor.fetchone()[0]
        print(f"📊 词汇书中共有 {count} 个单词")

    except Exception as e:
        print(f"❌ 导入失败: {e}")
        import traceback
        traceback.print_exc()
    finally:
        if cursor:
            cursor.close()
        if conn:
            conn.close()

if __name__ == '__main__':
    import_words_from_excel('data/小学.xlsx')