Files
ai_english/data/import_primary_words_complete.py
2025-11-17 13:39:05 +08:00

321 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""完整导入小学英语核心词汇到数据库(包含所有字段)"""
import json
import os
import re
from datetime import datetime

import mysql.connector
import pandas as pd
# Database connection settings.
# SECURITY: the password used to be hard-coded in source; it is now read from
# the DB_PASSWORD environment variable, falling back to the old literal so
# existing behavior is unchanged. Prefer setting DB_PASSWORD.
db_config = {
    'host': 'localhost',
    'port': 3306,
    'user': 'root',
    'password': os.environ.get('DB_PASSWORD', 'JKjk20011115'),
    'database': 'ai_english_learning',
    'charset': 'utf8mb4',
}

# ID of the vocabulary book all imported words are linked to.
BOOK_ID = 'primary_core_1000'
def clean_text(text):
    """Normalize a spreadsheet cell value.

    Returns the stripped string form of *text*, or None when the cell is
    NaN/None, empty, or the literal placeholder string 'nan'.
    """
    if pd.isna(text):
        return None
    stripped = str(text).strip()
    if stripped in ('', 'nan'):
        return None
    return stripped
def extract_part_of_speech(translation):
    """Infer an English part-of-speech label from a Chinese gloss.

    First looks for standard dictionary abbreviations ('v.', 'adj.', ...),
    then falls back to Chinese part-of-speech marker characters.  Returns
    'noun' when *translation* is falsy or nothing matches.
    """
    if not translation:
        return 'noun'
    pos_map = {
        'v.': 'verb',
        'n.': 'noun',
        'adj.': 'adjective',
        'adv.': 'adverb',
        'prep.': 'preposition',
        'conj.': 'conjunction',
        'pron.': 'pronoun',
        'interj.': 'interjection',
    }
    # Check longer abbreviations first: the original insertion-order scan
    # matched 'v.' inside 'adv.' (adverbs labeled as verbs) and the bare
    # 'n' inside 'conj.'.
    for abbr in sorted(pos_map, key=len, reverse=True):
        if abbr in translation or abbr.rstrip('.') in translation:
            return pos_map[abbr]
    # Fall back to Chinese marker characters.
    # NOTE(review): the original marker characters were lost in a bad paste
    # (they appeared as '' — the empty string — which matches every string,
    # so everything classified as 'verb').  The characters below are a
    # best-effort reconstruction; confirm against the original file.
    if '动' in translation:
        return 'verb'
    if '形容' in translation or '的' in translation:
        return 'adjective'
    if '地' in translation:
        return 'adverb'
    if '介' in translation:
        return 'preposition'
    if '连' in translation:
        return 'conjunction'
    return 'noun'  # default when no marker is found
def _split_to_json(text):
    """Split a semicolon-separated cell into a JSON array string.

    Returns '[]' for empty/None input.  NOTE(review): the original separator
    character was lost in a bad paste (split('') raises ValueError at
    runtime); both the fullwidth and ASCII semicolons are accepted here —
    confirm against the source spreadsheet.
    """
    if not text:
        return '[]'
    parts = [p.strip() for p in re.split('[;;]', text)]
    return json.dumps([p for p in parts if p], ensure_ascii=False)


def import_words_from_excel(file_path):
    """Import primary-school English vocabulary from an Excel file into MySQL.

    Reads one word per row, wipes the previous contents of the vocabulary
    book BOOK_ID, then inserts each word with its phonetics, definition,
    example sentences, synonym/antonym/derivative/collocation JSON blobs,
    and the book association.  Prints progress and a final summary.
    """
    # Initialized before the try so the finally block cannot raise NameError
    # when mysql.connector.connect() itself fails.
    conn = None
    cursor = None
    try:
        print(f"📖 正在读取文件: {file_path}")
        df = pd.read_excel(file_path)
        print(f"📊 文件列名: {df.columns.tolist()}")
        print(f"📊 总行数: {len(df)}")

        conn = mysql.connector.connect(**db_config)
        cursor = conn.cursor()

        # Remove the book's previous word links, then any vocabulary rows no
        # longer referenced by any book.
        print("\n清理旧数据...")
        cursor.execute("DELETE FROM ai_vocabulary_book_words WHERE book_id = %s", (BOOK_ID,))
        cursor.execute("""
            DELETE v FROM ai_vocabulary v
            LEFT JOIN ai_vocabulary_book_words bw ON bw.vocabulary_id = v.id
            WHERE bw.id IS NULL
        """)
        conn.commit()

        # Prepared SQL statements.
        insert_vocab_sql = """
            INSERT INTO ai_vocabulary
            (word, phonetic_us, phonetic_uk, phonetic, level, frequency, is_active,
             word_root, synonyms, antonyms, derivatives, collocations, created_at, updated_at)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        insert_definition_sql = """
            INSERT INTO ai_vocabulary_definitions
            (vocabulary_id, part_of_speech, definition_en, definition_cn, sort_order, created_at)
            VALUES (%s, %s, %s, %s, %s, %s)
        """
        insert_example_sql = """
            INSERT INTO ai_vocabulary_examples
            (vocabulary_id, sentence_en, sentence_cn, sort_order, created_at)
            VALUES (%s, %s, %s, %s, %s)
        """
        insert_book_word_sql = """
            INSERT INTO ai_vocabulary_book_words
            (book_id, vocabulary_id, sort_order, created_at)
            VALUES (%s, %s, %s, %s)
        """

        success_count = 0
        error_count = 0
        for index, row in df.iterrows():
            try:
                word = clean_text(row.get('Word'))
                if not word:
                    continue

                # Phonetics: prefer the US transcription for the generic field.
                phonetic_us = clean_text(row.get('美式音标'))
                phonetic_uk = clean_text(row.get('英式音标'))
                phonetic = phonetic_us or phonetic_uk

                translation_cn = clean_text(row.get('中文含义'))
                translation_en = clean_text(row.get('英文翻译(对应中文含义)'))
                if not translation_cn:
                    print(f"⚠️ 跳过 {word}:缺少中文含义")
                    continue
                if not translation_en:
                    # No English gloss in the sheet: fall back to the word itself.
                    translation_en = word

                part_of_speech = extract_part_of_speech(translation_cn)

                example_en = clean_text(row.get('例句'))
                example_cn = clean_text(row.get('例句中文翻译'))
                word_root = clean_text(row.get('词根'))

                # Semicolon-separated cells stored as JSON array strings.
                synonyms_json = _split_to_json(clean_text(row.get('同义词(含义)')))
                antonyms_json = _split_to_json(clean_text(row.get('反义词(含义)')))
                derivatives_json = _split_to_json(clean_text(row.get('派生词(含义)')))
                collocations_json = _split_to_json(clean_text(row.get('词组搭配(中文含义)')))

                now = datetime.now()
                cursor.execute(insert_vocab_sql, (
                    word,
                    phonetic_us,
                    phonetic_uk,
                    phonetic,
                    'beginner',      # level: this book is primary-school core
                    index + 1,       # frequency: 1-based row position
                    True,
                    word_root,
                    synonyms_json,
                    antonyms_json,
                    derivatives_json,
                    collocations_json,
                    now,
                    now,
                ))
                vocab_id = cursor.lastrowid

                # Primary definition (English gloss from the sheet, not the word).
                cursor.execute(insert_definition_sql, (
                    vocab_id,
                    part_of_speech,
                    translation_en,
                    translation_cn,
                    0,
                    now,
                ))

                # Example sentences: semicolon-separated, paired EN/CN by position.
                if example_en and example_cn:
                    pairs = zip(re.split('[;;]', example_en), re.split('[;;]', example_cn))
                    for i, (ex_en, ex_cn) in enumerate(pairs):
                        ex_en = ex_en.strip()
                        ex_cn = ex_cn.strip()
                        if ex_en and ex_cn:
                            cursor.execute(insert_example_sql, (vocab_id, ex_en, ex_cn, i, now))

                # Link the word into the vocabulary book, keeping sheet order.
                cursor.execute(insert_book_word_sql, (BOOK_ID, vocab_id, index, now))

                success_count += 1
                if success_count % 50 == 0:
                    print(f"✅ 已导入 {success_count} 个单词...")
                    conn.commit()
            except Exception as e:
                # Best-effort import: log the failing row and keep going.
                error_count += 1
                print(f"❌ 导入第 {index + 1} 行失败: {e}")
                print(f" 单词: {word if 'word' in locals() else 'N/A'}")

        conn.commit()

        # Record the final word count on the book row.
        cursor.execute(
            "UPDATE ai_vocabulary_books SET total_words = %s WHERE id = %s",
            (success_count, BOOK_ID)
        )
        conn.commit()

        print(f"\n🎉 导入完成!")
        print(f"✅ 成功: {success_count} 个单词")
        print(f"❌ 失败: {error_count} 个单词")

        # Verification: word, definition and example counts for the book.
        cursor.execute(
            "SELECT COUNT(*) FROM ai_vocabulary_book_words WHERE book_id = %s",
            (BOOK_ID,)
        )
        count = cursor.fetchone()[0]
        print(f"📊 词汇书中共有 {count} 个单词")

        cursor.execute("""
            SELECT COUNT(DISTINCT d.vocabulary_id)
            FROM ai_vocabulary_book_words bw
            JOIN ai_vocabulary_definitions d ON d.vocabulary_id = bw.vocabulary_id
            WHERE bw.book_id = %s
        """, (BOOK_ID,))
        def_count = cursor.fetchone()[0]
        print(f"📊 有释义的单词: {def_count}")

        cursor.execute("""
            SELECT COUNT(DISTINCT e.vocabulary_id)
            FROM ai_vocabulary_book_words bw
            JOIN ai_vocabulary_examples e ON e.vocabulary_id = bw.vocabulary_id
            WHERE bw.book_id = %s
        """, (BOOK_ID,))
        ex_count = cursor.fetchone()[0]
        print(f"📊 有例句的单词: {ex_count}")
    except Exception as e:
        print(f"❌ 导入失败: {e}")
        import traceback
        traceback.print_exc()
    finally:
        if cursor:
            cursor.close()
        if conn:
            conn.close()
if __name__ == '__main__':
    # Default input workbook (relative path); run from the project root.
    import_words_from_excel('data/小学.xlsx')