commit
92e122297f
@ -0,0 +1,2 @@
|
||||
.venv/
|
||||
src/__pycache__/
|
||||
@ -0,0 +1,15 @@
|
||||
# Requirements
|
||||
### Minimal versions
|
||||
python 3.7
|
||||
postgresql 11
|
||||
|
||||
### Database schema
|
||||
includes a generated `TSVECTOR` column `geodata_search_ts` with a GIN index `geodata_search_idx`
|
||||
```sql
|
||||
ALTER TABLE geodata ADD COLUMN geodata_search_ts TSVECTOR
|
||||
GENERATED ALWAYS AS (to_tsvector('russian', geodata.description)) STORED
|
||||
```
|
||||
(use coalesce() if combining multiple columns)
|
||||
```sql
|
||||
CREATE INDEX geodata_search_idx ON geodata USING GIN (geodata_search_ts);
|
||||
```
|
||||
@ -0,0 +1,15 @@
|
||||
{
|
||||
"folders": [
|
||||
{
|
||||
"path": "."
|
||||
}
|
||||
],
|
||||
"settings": {
|
||||
"python.defaultInterpreterPath": ".venv/bin/python",
|
||||
"python.formatting.blackPath": "/usr/bin/black",
|
||||
"python.linting.flake8Path": "/usr/bin/flake8",
|
||||
"python.linting.flake8Args": [
|
||||
"--black-config=''"
|
||||
]
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,18 @@
|
||||
anyio==3.5.0
|
||||
asgiref==3.5.0
|
||||
click==8.0.4
|
||||
et-xmlfile==1.1.0
|
||||
fastapi==0.74.1
|
||||
greenlet==1.1.2
|
||||
h11==0.13.0
|
||||
idna==3.3
|
||||
openpyxl==3.0.9
|
||||
psycopg2==2.9.3
|
||||
pydantic==1.9.0
|
||||
python-multipart==0.0.5
|
||||
six==1.16.0
|
||||
sniffio==1.2.0
|
||||
SQLAlchemy==1.4.31
|
||||
starlette==0.17.1
|
||||
typing_extensions==4.1.1
|
||||
uvicorn==0.17.5
|
||||
@ -0,0 +1,14 @@
|
||||
anyio==3.5.0
|
||||
et-xmlfile==1.1.0
|
||||
fastapi==0.74.1
|
||||
greenlet==1.1.2
|
||||
idna==3.3
|
||||
openpyxl==3.0.9
|
||||
psycopg2==2.9.3
|
||||
pydantic==1.9.0
|
||||
python-multipart==0.0.5
|
||||
six==1.16.0
|
||||
sniffio==1.2.0
|
||||
SQLAlchemy==1.4.31
|
||||
starlette==0.17.1
|
||||
typing_extensions==4.1.1
|
||||
@ -0,0 +1,62 @@
|
||||
from typing import List
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy.sql.expression import func
|
||||
from sqlalchemy import inspect
|
||||
|
||||
from . import models, schemas
|
||||
from .database import Base
|
||||
|
||||
|
||||
def get_item(db: Session, item_id: int):
    """Return a single dataset description by its primary key, or None."""
    query = db.query(models.Item)
    matching = query.filter(models.Item.id == item_id)
    return matching.first()
|
||||
|
||||
|
||||
def get_item_by_description(db: Session, needle: str, skip: int = 0, limit: int = 20):
    """Full-text search over dataset descriptions.

    *needle* may use websearch syntax, which PostgreSQL turns into a tsquery:
    - plain text: the given words in any order
    - "quoted text": the given words in the given order
    - OR: any of the given words
    - -word: must not contain the word

    Returns a page (skip/limit) of matching Items ordered by id.
    """
    # hardcode russian for now
    # built-in postgres websearch_to_tsquery() is good
    # https://www.postgresql.org/docs/11/textsearch-controls.html#TEXTSEARCH-PARSING-QUERIES
    # TODO: add requested search columns [0]
    items_table = inspect(models.Item).local_table
    # BUG FIX: the mapped "geodata" table has no "ts" column — the generated
    # search-vector column is "geodata_search_ts" (see models.Item and the
    # README schema); accessing ".c.ts" raised AttributeError at query time.
    ts_match = items_table.c.geodata_search_ts.op("@@")(
        func.websearch_to_tsquery('"russian"', needle)
    )
    result = (
        db.query(models.Item)
        .filter(ts_match)
        .order_by(items_table.c.id)
        .offset(skip)
        .limit(limit)
        .all()
    )
    return result
|
||||
|
||||
|
||||
def get_items(db: Session, skip: int = 0, limit: int = 20):
    """Return one page (skip/limit) of the dataset descriptions in the system."""
    page = db.query(models.Item).offset(skip).limit(limit)
    return page.all()
|
||||
|
||||
|
||||
def insert_items(db: Session, items: List[schemas.ItemCreate]):
    """Bulk-import dataset descriptions.

    Returns a (accepted, inserted) tuple: how many rows were handed in and
    how many actually landed in the table.
    """
    accepted = len(items)
    count_before = db.query(models.Item).count()
    # https://docs.sqlalchemy.org/en/14/dialects/postgresql.html#psycopg2-fast-execution-helpers
    db.add_all(models.ItemCreate(**entry.dict()) for entry in items)
    db.commit()
    count_after = db.query(models.Item).count()
    return (accepted, count_after - count_before)
|
||||
|
||||
|
||||
def get_headers(db: Session):
    """Return every header-mapping row (full table titles)."""
    headers_query = db.query(models.Header)
    return headers_query.all()
|
||||
@ -0,0 +1,18 @@
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
# Database connection parameters
# NOTE(review): credentials are hardcoded ("PASSWORD" placeholder) — move them
# to environment/config before deployment.
# NOTE(review): name is misspelled ("SQALCHEMY"); kept as-is since it is a
# module-level name that may be referenced elsewhere.
SQALCHEMY_DATABASE_URL = "postgresql+psycopg2://geodata:PASSWORD@postgres/geodata"


# Engine with psycopg2 fast-executemany settings tuned for bulk inserts
# (used by crud.insert_items).
engine = create_engine(
    SQALCHEMY_DATABASE_URL,
    # https://docs.sqlalchemy.org/en/14/dialects/postgresql.html#psycopg2-connect-arguments
    executemany_mode="values_only",
    executemany_values_page_size=10000,
    executemany_batch_page_size=500,
)
# Session factory; sessions must be closed by the caller (see main.get_db).
SessionLocal = sessionmaker(autocommit=False, bind=engine)

# Declarative base shared by all ORM models.
Base = declarative_base()
|
||||
@ -0,0 +1,139 @@
|
||||
from datetime import datetime
|
||||
from re import IGNORECASE, sub as substitute
|
||||
from typing import List
|
||||
from urllib.parse import unquote_plus
|
||||
|
||||
from fastapi import Depends, FastAPI, File, HTTPException, UploadFile
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from . import crud, models, schemas, spreadsheet
|
||||
from .database import SessionLocal, engine
|
||||
|
||||
|
||||
# TODO: switch for Alembic [2]
# Create any missing tables at import time (no migration tooling yet).
models.Base.metadata.create_all(bind=engine)

app = FastAPI()
|
||||
|
||||
# Dependency
|
||||
def get_db():
    """FastAPI dependency: yield a database session, closing it afterwards."""
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
|
||||
|
||||
|
||||
@app.post(
    "/items/",
    responses={
        422: {
            "description": "Загружен некорректный файл (ожидался .xlsx). "
            "Неизвестный заголовок. Или обнаружены данные в столбце без заголовка. "
            "Необходимо использовать файлы установленного образца."
        },
    },
    response_model=schemas.InsertStatus,
)
def create_items(file: UploadFile = File(...), db: Session = Depends(get_db)):
    """Import dataset descriptions into the database from an Excel file."""
    try:
        start = datetime.now()
        # parse spreadsheet into a collection of header and data iterators
        # TODO: Multiple worksheets per Workbook or one? [3]
        # assume one for now (second one have strange formatting)
        with spreadsheet.parse(file=file) as spreadsheet_parse:
            sheet = spreadsheet_parse[0]
            # TODO use fullfledged validation framework? [6]
            # ad-hoc for now:
            # fetch known headers from the database
            # NB: we never check the actual order of the columns, so we might suffer for it

            # we have to construct a collection of known headers anyway,
            # might as well make it a set
            unknown_headers = set(sheet["header"]) - set(
                dbh.spreadsheet for dbh in crud.get_headers(db=db)
            )
            if unknown_headers:
                raise HTTPException(
                    status_code=422,
                    detail="Unknown headers in the spreadsheet: {}. Check the coordinated format".format(
                        unknown_headers
                    ),
                )

            # construct a list of schemas.Item items
            # NOTE(review): row cells are mapped onto schema fields purely by
            # position — assumes spreadsheet column order matches
            # schemas.ItemCreate field order; verify against the template file.
            spreadsheet_item_list = [
                schemas.ItemCreate(
                    **{
                        key: row[i]
                        for i, key in enumerate(schemas.ItemCreate.__fields__.keys())
                    }
                )
                for row in sheet["data"]
            ]
            # dump all the data into database
            accepted, processed = crud.insert_items(db=db, items=spreadsheet_item_list)

    # openpyxl could not open the upload as an .xlsx workbook
    except spreadsheet.InvalidFileException:
        raise HTTPException(
            status_code=422, detail="Invalid file upload (expected .xlsx)"
        )
    # raised by spreadsheet.parse when a header-less column contains data
    except spreadsheet.DataInUnnamedColumnException:
        raise HTTPException(
            status_code=422, detail="Data is found in a column with empty header"
        )

    # accepted != processed means some rows were silently not inserted
    return schemas.InsertStatus(
        status="Success" if accepted == processed else "Warning",
        accepted=accepted,
        processed=processed,
        process_time=datetime.now() - start,
    )
|
||||
|
||||
|
||||
@app.get(
    "/item/{item_id}",
    response_model=schemas.Item,
    responses={404: {"description": "Такой набор данных отсутствует"}},
)
def read_item(item_id: int, db: Session = Depends(get_db)):
    """Individual page for one dataset description."""
    item = crud.get_item(db=db, item_id=item_id)
    if item is None:
        raise HTTPException(status_code=404, detail="Item not found")
    return item
|
||||
|
||||
|
||||
@app.get("/items/", response_model=List[schemas.Item])
def read_items(skip: int = 0, limit: int = 20, db: Session = Depends(get_db)):
    """Paged list of the dataset descriptions available in the system."""
    items = crud.get_items(db=db, skip=skip, limit=limit)
    return items
|
||||
|
||||
|
||||
@app.get(
    "/search/",
    response_model=List[schemas.Item],
    responses={400: {"description": "Запрос слишком короткий (минимум 3 символа)"}},
)
def search(q: str, skip: int = 0, limit: int = 20, db: Session = Depends(get_db)):
    """Full-text search over dataset description attributes.

    Supported query syntax (converted into a websearch tsquery):
    - plain text: the given words in any order
    - "quoted text": the given words in the given order
    - OR / ИЛИ: any of the given words
    - -word: must not contain the word

    Raises 400 when the decoded, normalized query is shorter than 3 characters.
    """
    # FIX: the 400-response description above had a typo ("минимумл").
    q = unquote_plus(q)
    # replace all full ИЛИ words with OR so PostgreSQL's websearch parser sees it
    q = substitute(r"\bИЛИ\b", "OR", q, flags=IGNORECASE)
    # NB: length is checked after normalization, so a bare "ИЛИ" (-> "OR") is rejected
    if len(q) < 3:
        raise HTTPException(status_code=400, detail="Query too short")
    return crud.get_item_by_description(db=db, needle=q, skip=skip, limit=limit)
|
||||
|
||||
|
||||
@app.get("/headers/", response_model=List[schemas.Header])
def read_headers(db: Session = Depends(get_db)):
    """Full human-readable column titles for the tables."""
    headers = crud.get_headers(db=db)
    return headers
|
||||
@ -0,0 +1,50 @@
|
||||
from sqlalchemy import Column, Integer, String, DateTime
|
||||
|
||||
# we might need vector concat later, then we'll have to bring in sqlalchemy_utils
|
||||
# https://sqlalchemy-utils.readthedocs.io/en/latest/_modules/sqlalchemy_utils/types/ts_vector.html
|
||||
from sqlalchemy.dialects.postgresql import TSVECTOR
|
||||
|
||||
from .database import Base
|
||||
|
||||
|
||||
class Header(Base):
    """Mapping between database column names and spreadsheet header titles."""

    __tablename__ = "headers"

    # database-side column name (primary key)
    database = Column(String, primary_key=True)
    # human-readable spreadsheet header this column corresponds to
    spreadsheet = Column(String)
|
||||
|
||||
|
||||
class ItemBase(Base):
    """ORM mapping for the "geodata" table: one dataset description per row.

    Every payload column is a plain String — values are stored exactly as they
    appear in the imported spreadsheet cells (including dates; see the `date`
    column below).
    """

    __tablename__ = "geodata"

    id = Column(Integer, primary_key=True, index=True)
    fadr = Column(String)
    oldid = Column(String)
    category = Column(String)
    basin = Column(String)
    deposit = Column(String)
    well = Column(String)
    depth = Column(String)
    stratum = Column(String)
    owner = Column(String)
    org = Column(String)
    ownercontacts = Column(String)
    samplelist = Column(String)
    # free-text description — feeds the generated geodata_search_ts vector
    description = Column(String)
    # NB: "formdimentions" spelling matches the live column name — do not "fix"
    formdimentions = Column(String)
    datalist = Column(String)
    resolution = Column(String)
    date = Column(String)
    additionalinfo = Column(String)
    scanner = Column(String)
    comment = Column(String)
    continuation = Column(String)
|
||||
|
||||
|
||||
class ItemCreate(ItemBase):
    """Insert-side model: ItemBase without the generated search-vector column.

    NOTE(review): subclassing an already-mapped class produces single-table
    inheritance over "geodata" with no discriminator — confirm this is the
    intended way to exclude geodata_search_ts from inserts.
    """

    pass
|
||||
|
||||
|
||||
class Item(ItemBase):
    """Read-side model: ItemBase plus the full-text search vector."""

    # TSVECTOR column generated by the database (GENERATED ALWAYS AS ... STORED,
    # see the README schema); never written by the application.
    geodata_search_ts = Column(TSVECTOR)
|
||||
@ -0,0 +1,69 @@
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Optional, Union
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class HeaderBase(BaseModel):
    """Pair of a database column name and its spreadsheet header title."""

    # database-side column name
    database: str
    # human-readable spreadsheet header
    spreadsheet: str
|
||||
|
||||
|
||||
class HeaderCreate(HeaderBase):
    """Create schema: identical to HeaderBase for now."""

    pass
|
||||
|
||||
|
||||
class Header(HeaderBase):
    """Header of a geological dataset description
    (field-expedition survey data); read schema built from ORM rows."""

    class Config:
        # allow pydantic to read attributes off SQLAlchemy objects
        orm_mode = True
|
||||
|
||||
|
||||
class ItemBase(BaseModel):
    """Description of a geological dataset gathered during field expeditions.

    All fields are optional free-form strings taken straight from spreadsheet
    cells; `date` may also arrive as a parsed datetime.
    NOTE(review): field declaration order must match the spreadsheet column
    order — main.create_items maps row cells onto these fields positionally.
    """

    fadr: Optional[str] = None
    oldid: Optional[str] = None
    category: Optional[str] = None
    basin: Optional[str] = None
    deposit: Optional[str] = None
    well: Optional[str] = None
    depth: Optional[str] = None
    stratum: Optional[str] = None
    owner: Optional[str] = None
    org: Optional[str] = None
    ownercontacts: Optional[str] = None
    samplelist: Optional[str] = None
    description: Optional[str] = None
    # NB: "formdimentions" spelling matches the database column — do not "fix"
    formdimentions: Optional[str] = None
    datalist: Optional[str] = None
    resolution: Optional[str] = None
    # spreadsheet date cells may come through as datetime objects
    date: Optional[Union[datetime, str]] = None
    additionalinfo: Optional[str] = None
    scanner: Optional[str] = None
    comment: Optional[str] = None
    continuation: Optional[str] = None
|
||||
|
||||
|
||||
class ItemCreate(ItemBase):
    """Create schema: identical to ItemBase for now."""

    pass
|
||||
|
||||
|
||||
class Item(ItemBase):
    """Read schema: an item as returned by the API, with its database id."""

    id: int

    class Config:
        # allow pydantic to read attributes off SQLAlchemy objects
        orm_mode = True
|
||||
|
||||
|
||||
class InsertStatus(BaseModel):
    """Basic insertion status metrics reported after a bulk import."""

    # "Success" when accepted == processed, "Warning" otherwise
    status: str
    # number of rows parsed from the spreadsheet
    accepted: int
    # number of rows actually added to the table
    processed: int
    # wall-clock duration of the whole import
    process_time: timedelta
|
||||
@ -0,0 +1,103 @@
|
||||
from openpyxl import Workbook, load_workbook
|
||||
from openpyxl.utils.exceptions import InvalidFileException
|
||||
from fastapi import File, UploadFile
|
||||
|
||||
from asyncio import run as asyncio_run
|
||||
from contextlib import closing, contextmanager
|
||||
from itertools import chain
|
||||
from os import remove
|
||||
from pathlib import Path
|
||||
from shutil import copyfileobj
|
||||
from tempfile import NamedTemporaryFile
|
||||
|
||||
|
||||
class DataInUnnamedColumnException(Exception):
    """Raised when a column without a header cell contains data —
    every column holding data has to be named."""
|
||||
|
||||
|
||||
@contextmanager
def parse(file: UploadFile = File(...)):
    """Context manager: parse an uploaded .xlsx into per-sheet iterators.

    Yields a list with one dict per sheet (currently only the active sheet):

    [
        {"header": header_iterator,"data": data_iterator}, # sheet1
        {"header": header_iterator,"data": data_iterator}, # sheet2
        # etc
    ]

    Raises DataInUnnamedColumnException if any cell right of the last named
    header column holds data; openpyxl's InvalidFileException propagates for
    non-.xlsx uploads. The upload and the temp copy are cleaned up on exit.
    """
    # prepare return list
    result = []

    suffix = Path(file.filename).suffix
    try:
        # TODO: decide if we use subdir in /tmp here, then create it [5]
        # NOTE(review): /tmp/notgeo must already exist or NamedTemporaryFile
        # fails — and then the finally block below hits NameError on `tmp` /
        # `spreadsheet_file`, masking the original error. Confirm and harden.
        with NamedTemporaryFile(delete=False, suffix=suffix, dir="/tmp/notgeo") as tmp:
            # spool the upload to disk so openpyxl can open it by path
            copyfileobj(file.file, tmp)
            spreadsheet_file = Path(tmp.name)

        # Unlike a normal workbook, a read-only workbook will use lazy loading.
        # The workbook must be explicitly closed with the close() method.

        # we use contextlib.closing() to do it for us
        with closing(load_workbook(spreadsheet_file, read_only=True)) as wb:
            # TODO: Multiple worksheets per Workbook or one? [3]
            # assume one for now (second one have strange formatting)
            ws = wb.active
            # assume headers are in the top row
            header = [
                cell
                for row in ws.iter_rows(max_row=1, values_only=True)
                for cell in row
            ]

            # assume data stretch is continuous
            # find first occurrence of None in header and assert that
            # no unnamed column contains any data
            # .index is 0-based and min_col= is 1-based so we want to
            # go for header.index(None)+1
            # unpack row generator into separate cell iterators, chain them together,
            # and make sure none of the cells contains any data
            try:
                last_column_with_header = header.index(None)
                if any(
                    chain(
                        *(
                            ws.iter_rows(
                                min_col=last_column_with_header + 1, values_only=True
                            )
                        )
                    )
                ):
                    raise DataInUnnamedColumnException(
                        "Data is found in a column with empty header"
                    )
            except ValueError:
                # header.index(None) couldn't find anything,
                # all cell in the first row are filled,
                # so all columns have headers,
                # we can safely continue
                last_column_with_header = len(header)

            # Construct spreadsheet data iterator
            # .index is 0-based and max_col= is 1-based so we might've wanted to
            # go for header.index(None)+1, but max_col= range is including
            # and we want to only include non-empty columns, and go for
            # the previous one (-1) so in the end
            # max_col=header.index(None)+1-1
            data = ws.iter_rows(
                min_row=2, max_col=last_column_with_header, values_only=True
            )
            result.append(
                {"header": (cell for cell in header if cell is not None), "data": data}
            )
            # END [3]

        yield result
    finally:
        # clean up by explicitly closing the files and removing temporary spreadsheet
        # file.close() is a coroutine on starlette's UploadFile, hence asyncio_run
        asyncio_run(file.close())
        tmp.close()
        remove(spreadsheet_file)
|
||||
Loading…
Reference in new issue